From 68c7e618d59d3d841cab1febdd77556e5c1faa78 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 11 Dec 2015 09:19:49 -0800 Subject: [PATCH] 4.2-stable patches added patches: alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch btrfs-check-unsupported-filters-in-balance-arguments.patch btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch btrfs-fix-race-when-listing-an-inode-s-xattrs.patch btrfs-fix-regression-when-running-delayed-references.patch btrfs-fix-resending-received-snapshot-with-parent.patch btrfs-fix-signed-overflows-in-btrfs_sync_file.patch btrfs-fix-truncation-of-compressed-and-inlined-extents.patch ceph-fix-message-length-computation.patch cobalt-fix-kconfig-dependency.patch debugfs-fix-refcount-imbalance-in-start_creating.patch ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch ext4-crypto-replace-some-bug_on-s-with-error-checks.patch ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch nfs4-limit-callback-decoding-to-received-bytes.patch nfs4-start-callback_ident-at-idr-1.patch nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch nfsd-serialize-state-seqid-morphing-operations.patch ocfs2-fix-umask-ignored-issue.patch rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch --- ...ake-fix-ups-to-broxton-display-codec.patch | 33 ++ ...pported-filters-in-balance-arguments.patch | 57 +++ ...ta-loss-after-cloning-inline-extents.patch | 443 ++++++++++++++++++ ...-when-running-delalloc-for-nodatacow.patch | 122 +++++ ...-item-deletion-when-dropping-extents.patch 
| 198 ++++++++ ...-race-when-listing-an-inode-s-xattrs.patch | 92 ++++ ...sion-when-running-delayed-references.patch | 307 ++++++++++++ ...ending-received-snapshot-with-parent.patch | 76 +++ ...-signed-overflows-in-btrfs_sync_file.patch | 66 +++ ...on-of-compressed-and-inlined-extents.patch | 288 ++++++++++++ .../ceph-fix-message-length-computation.patch | 37 ++ queue-4.2/cobalt-fix-kconfig-dependency.patch | 55 +++ ...refcount-imbalance-in-start_creating.patch | 47 ++ ...o-fix-bugs-in-ext4_encrypted_zeroout.patch | 90 ++++ ...x-memory-leak-in-ext4_bio_write_page.patch | 51 ++ ...lace-some-bug_on-s-with-error-checks.patch | 101 ++++ ...se-after-free-in-__ext4_journal_stop.patch | 44 ++ ...ter-recording-an-error-in-superblock.patch | 104 ++++ ...-jmicron-jmb38x-it-context-discovery.patch | 71 +++ ...-t-declare-the-attribute-cache-valid.patch | 39 ++ ...-callback-decoding-to-received-bytes.patch | 97 ++++ .../nfs4-start-callback_ident-at-idr-1.patch | 33 ++ ...g-duplicate-and-repeated-delegations.patch | 200 ++++++++ ...lize-state-seqid-morphing-operations.patch | 207 ++++++++ queue-4.2/ocfs2-fix-umask-ignored-issue.patch | 36 ++ ...ap_context-twice-in-rbd_queue_workfn.patch | 34 ++ queue-4.2/series | 26 + 27 files changed, 2954 insertions(+) create mode 100644 queue-4.2/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch create mode 100644 queue-4.2/btrfs-check-unsupported-filters-in-balance-arguments.patch create mode 100644 queue-4.2/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch create mode 100644 queue-4.2/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch create mode 100644 queue-4.2/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch create mode 100644 queue-4.2/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch create mode 100644 queue-4.2/btrfs-fix-regression-when-running-delayed-references.patch create mode 100644 
queue-4.2/btrfs-fix-resending-received-snapshot-with-parent.patch create mode 100644 queue-4.2/btrfs-fix-signed-overflows-in-btrfs_sync_file.patch create mode 100644 queue-4.2/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch create mode 100644 queue-4.2/ceph-fix-message-length-computation.patch create mode 100644 queue-4.2/cobalt-fix-kconfig-dependency.patch create mode 100644 queue-4.2/debugfs-fix-refcount-imbalance-in-start_creating.patch create mode 100644 queue-4.2/ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch create mode 100644 queue-4.2/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch create mode 100644 queue-4.2/ext4-crypto-replace-some-bug_on-s-with-error-checks.patch create mode 100644 queue-4.2/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch create mode 100644 queue-4.2/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch create mode 100644 queue-4.2/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch create mode 100644 queue-4.2/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch create mode 100644 queue-4.2/nfs4-limit-callback-decoding-to-received-bytes.patch create mode 100644 queue-4.2/nfs4-start-callback_ident-at-idr-1.patch create mode 100644 queue-4.2/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch create mode 100644 queue-4.2/nfsd-serialize-state-seqid-morphing-operations.patch create mode 100644 queue-4.2/ocfs2-fix-umask-ignored-issue.patch create mode 100644 queue-4.2/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch diff --git a/queue-4.2/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch b/queue-4.2/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch new file mode 100644 index 00000000000..542db357766 --- /dev/null +++ b/queue-4.2/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch @@ -0,0 +1,33 @@ +From e2656412f2a7343ecfd13eb74bac0a6e6e9c5aad Mon Sep 17 00:00:00 
2001 +From: "Lu, Han" +Date: Wed, 11 Nov 2015 16:54:27 +0800 +Subject: ALSA: hda/hdmi - apply Skylake fix-ups to Broxton display codec + +From: "Lu, Han" + +commit e2656412f2a7343ecfd13eb74bac0a6e6e9c5aad upstream. + +Broxton and Skylake have the same behavior on display audio. So this patch +applies Skylake fix-ups to Broxton. + +Signed-off-by: Lu, Han +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman + +--- + sound/pci/hda/patch_hdmi.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/sound/pci/hda/patch_hdmi.c ++++ b/sound/pci/hda/patch_hdmi.c +@@ -48,8 +48,9 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't + #define is_haswell(codec) ((codec)->core.vendor_id == 0x80862807) + #define is_broadwell(codec) ((codec)->core.vendor_id == 0x80862808) + #define is_skylake(codec) ((codec)->core.vendor_id == 0x80862809) ++#define is_broxton(codec) ((codec)->core.vendor_id == 0x8086280a) + #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \ +- || is_skylake(codec)) ++ || is_skylake(codec) || is_broxton(codec)) + + #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882) + #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883) diff --git a/queue-4.2/btrfs-check-unsupported-filters-in-balance-arguments.patch b/queue-4.2/btrfs-check-unsupported-filters-in-balance-arguments.patch new file mode 100644 index 00000000000..578a3e92041 --- /dev/null +++ b/queue-4.2/btrfs-check-unsupported-filters-in-balance-arguments.patch @@ -0,0 +1,57 @@ +From 849ef9286f30c88113906dc35f44a499c0cb385d Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Mon, 12 Oct 2015 16:55:54 +0200 +Subject: btrfs: check unsupported filters in balance arguments + +From: David Sterba + +commit 849ef9286f30c88113906dc35f44a499c0cb385d upstream. + +We don't verify that all the balance filter arguments supplemented by +the flags are actually known to the kernel. Thus we let it silently pass +and do nothing. 
+ +At the moment this means only the 'limit' filter, but we're going to add +a few more soon so it's better to have that fixed. Also in older stable +kernels so that it works with newer userspace tools. + +Signed-off-by: David Sterba +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 5 +++++ + fs/btrfs/volumes.h | 8 ++++++++ + 2 files changed, 13 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -4652,6 +4652,11 @@ locked: + goto out_bctl; + } + ++ if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { ++ ret = -EINVAL; ++ goto out_bargs; ++ } ++ + do_balance: + /* + * Ownership of bctl and mutually_exclusive_operation_running +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -384,6 +384,14 @@ struct map_lookup { + BTRFS_BALANCE_ARGS_VRANGE | \ + BTRFS_BALANCE_ARGS_LIMIT) + ++#define BTRFS_BALANCE_ARGS_MASK \ ++ (BTRFS_BALANCE_ARGS_PROFILES | \ ++ BTRFS_BALANCE_ARGS_USAGE | \ ++ BTRFS_BALANCE_ARGS_DEVID | \ ++ BTRFS_BALANCE_ARGS_DRANGE | \ ++ BTRFS_BALANCE_ARGS_VRANGE | \ ++ BTRFS_BALANCE_ARGS_LIMIT) ++ + /* + * Profile changing flags. When SOFT is set we won't relocate chunk if + * it already has the target profile (even though it may be diff --git a/queue-4.2/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch b/queue-4.2/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch new file mode 100644 index 00000000000..6ae44758b5f --- /dev/null +++ b/queue-4.2/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch @@ -0,0 +1,443 @@ +From 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 13 Oct 2015 15:15:00 +0100 +Subject: Btrfs: fix file corruption and data loss after cloning inline extents + +From: Filipe Manana + +commit 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 upstream. 
+ +Currently the clone ioctl allows to clone an inline extent from one file +to another that already has other (non-inlined) extents. This is a problem +because btrfs is not designed to deal with files having inline and regular +extents, if a file has an inline extent then it must be the only extent +in the file and must start at file offset 0. Having a file with an inline +extent followed by regular extents results in EIO errors when doing reads +or writes against the first 4K of the file. + +Also, the clone ioctl allows one to lose data if the source file consists +of a single inline extent, with a size of N bytes, and the destination +file consists of a single inline extent with a size of M bytes, where we +have M > N. In this case the clone operation removes the inline extent +from the destination file and then copies the inline extent from the +source file into the destination file - we lose the M - N bytes from the +destination file, a read operation will get the value 0x00 for any bytes +in the range [N, M] (the destination inode's i_size remained as M, +that's why we can read past N bytes). + +So fix this by not allowing such destructive operations to happen and +return errno EOPNOTSUPP to user space. + +Currently the fstest btrfs/035 tests the data loss case but it totally +ignores this - i.e. expects the operation to succeed and does not check +that we got data loss. + +The following test case for fstests exercises all these cases that result +in file corruption and data loss: + + seq=`basename $0` + seqres=$RESULT_DIR/$seq + echo "QA output created by $seq" + tmp=/tmp/$$ + status=1 # failure is the default! + trap "_cleanup; exit \$status" 0 1 2 3 15 + + _cleanup() + { + rm -f $tmp.* + } + + # get standard environment, filters and checks + . ./common/rc + . 
./common/filter + + # real QA test starts here + _need_to_be_root + _supported_fs btrfs + _supported_os Linux + _require_scratch + _require_cloner + _require_btrfs_fs_feature "no_holes" + _require_btrfs_mkfs_feature "no-holes" + + rm -f $seqres.full + + test_cloning_inline_extents() + { + local mkfs_opts=$1 + local mount_opts=$2 + + _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 + _scratch_mount $mount_opts + + # File bar, the source for all the following clone operations, consists + # of a single inline extent (50 bytes). + $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ + | _filter_xfs_io + + # Test cloning into a file with an extent (non-inlined) where the + # destination offset overlaps that extent. It should not be possible to + # clone the inline extent from file bar into this file. + $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ + | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo + + # Doing IO against any range in the first 4K of the file should work. + # Due to a past clone ioctl bug which allowed cloning the inline extent, + # these operations resulted in EIO errors. + echo "File foo data after clone operation:" + # All bytes should have the value 0xaa (clone operation failed and did + # not modify our file). + od -t x1 $SCRATCH_MNT/foo + $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io + + # Test cloning the inline extent against a file which has a hole in its + # first 4K followed by a non-inlined extent. It should not be possible + # as well to clone the inline extent from file bar into this file. + $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ + | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 + + # Doing IO against any range in the first 4K of the file should work. + # Due to a past clone ioctl bug which allowed cloning the inline extent, + # these operations resulted in EIO errors. 
+ echo "File foo2 data after clone operation:" + # All bytes should have the value 0x00 (clone operation failed and did + # not modify our file). + od -t x1 $SCRATCH_MNT/foo2 + $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io + + # Test cloning the inline extent against a file which has a size of zero + # but has a prealloc extent. It should not be possible as well to clone + # the inline extent from file bar into this file. + $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 + + # Doing IO against any range in the first 4K of the file should work. + # Due to a past clone ioctl bug which allowed cloning the inline extent, + # these operations resulted in EIO errors. + echo "First 50 bytes of foo3 after clone operation:" + # Should not be able to read any bytes, file has 0 bytes i_size (the + # clone operation failed and did not modify our file). + od -t x1 $SCRATCH_MNT/foo3 + $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io + + # Test cloning the inline extent against a file which consists of a + # single inline extent that has a size not greater than the size of + # bar's inline extent (40 < 50). + # It should be possible to do the extent cloning from bar to this file. + $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ + | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 + + # Doing IO against any range in the first 4K of the file should work. + echo "File foo4 data after clone operation:" + # Must match file bar's content. + od -t x1 $SCRATCH_MNT/foo4 + $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io + + # Test cloning the inline extent against a file which consists of a + # single inline extent that has a size greater than the size of bar's + # inline extent (60 > 50). + # It should not be possible to clone the inline extent from file bar + # into this file. 
+ $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ + | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 + + # Reading the file should not fail. + echo "File foo5 data after clone operation:" + # Must have a size of 60 bytes, with all bytes having a value of 0x03 + # (the clone operation failed and did not modify our file). + od -t x1 $SCRATCH_MNT/foo5 + + # Test cloning the inline extent against a file which has no extents but + # has a size greater than bar's inline extent (16K > 50). + # It should not be possible to clone the inline extent from file bar + # into this file. + $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 + + # Reading the file should not fail. + echo "File foo6 data after clone operation:" + # Must have a size of 16K, with all bytes having a value of 0x00 (the + # clone operation failed and did not modify our file). + od -t x1 $SCRATCH_MNT/foo6 + + # Test cloning the inline extent against a file which has no extents but + # has a size not greater than bar's inline extent (30 < 50). + # It should be possible to clone the inline extent from file bar into + # this file. + $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 + + # Reading the file should not fail. + echo "File foo7 data after clone operation:" + # Must have a size of 50 bytes, with all bytes having a value of 0xbb. + od -t x1 $SCRATCH_MNT/foo7 + + # Test cloning the inline extent against a file which has a size not + # greater than the size of bar's inline extent (20 < 50) but has + # a prealloc extent that goes beyond the file's size. It should not be + # possible to clone the inline extent from bar into this file. 
+ $XFS_IO_PROG -f -c "falloc -k 0 1M" \ + -c "pwrite -S 0x88 0 20" \ + $SCRATCH_MNT/foo8 | _filter_xfs_io + $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 + + echo "File foo8 data after clone operation:" + # Must have a size of 20 bytes, with all bytes having a value of 0x88 + # (the clone operation did not modify our file). + od -t x1 $SCRATCH_MNT/foo8 + + _scratch_unmount + } + + echo -e "\nTesting without compression and without the no-holes feature...\n" + test_cloning_inline_extents + + echo -e "\nTesting with compression and without the no-holes feature...\n" + test_cloning_inline_extents "" "-o compress" + + echo -e "\nTesting without compression and with the no-holes feature...\n" + test_cloning_inline_extents "-O no-holes" "" + + echo -e "\nTesting with compression and with the no-holes feature...\n" + test_cloning_inline_extents "-O no-holes" "-o compress" + + status=0 + exit + +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 195 ++++++++++++++++++++++++++++++++++++++++++------------- + 1 file changed, 152 insertions(+), 43 deletions(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3320,6 +3320,150 @@ static void clone_update_extent_map(stru + &BTRFS_I(inode)->runtime_flags); + } + ++/* ++ * Make sure we do not end up inserting an inline extent into a file that has ++ * already other (non-inline) extents. If a file has an inline extent it can ++ * not have any other extents and the (single) inline extent must start at the ++ * file offset 0. 
Failing to respect these rules will lead to file corruption, ++ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc ++ * ++ * We can have extents that have been already written to disk or we can have ++ * dirty ranges still in delalloc, in which case the extent maps and items are ++ * created only when we run delalloc, and the delalloc ranges might fall outside ++ * the range we are currently locking in the inode's io tree. So we check the ++ * inode's i_size because of that (i_size updates are done while holding the ++ * i_mutex, which we are holding here). ++ * We also check to see if the inode has a size not greater than "datal" but has ++ * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are ++ * protected against such concurrent fallocate calls by the i_mutex). ++ * ++ * If the file has no extents but a size greater than datal, do not allow the ++ * copy because we would need turn the inline extent into a non-inline one (even ++ * with NO_HOLES enabled). If we find our destination inode only has one inline ++ * extent, just overwrite it with the source inline extent if its size is less ++ * than the source extent's size, or we could copy the source inline extent's ++ * data into the destination inode's inline extent if the later is greater then ++ * the former. 
++ */ ++static int clone_copy_inline_extent(struct inode *src, ++ struct inode *dst, ++ struct btrfs_trans_handle *trans, ++ struct btrfs_path *path, ++ struct btrfs_key *new_key, ++ const u64 drop_start, ++ const u64 datal, ++ const u64 skip, ++ const u64 size, ++ char *inline_data) ++{ ++ struct btrfs_root *root = BTRFS_I(dst)->root; ++ const u64 aligned_end = ALIGN(new_key->offset + datal, ++ root->sectorsize); ++ int ret; ++ struct btrfs_key key; ++ ++ if (new_key->offset > 0) ++ return -EOPNOTSUPP; ++ ++ key.objectid = btrfs_ino(dst); ++ key.type = BTRFS_EXTENT_DATA_KEY; ++ key.offset = 0; ++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); ++ if (ret < 0) { ++ return ret; ++ } else if (ret > 0) { ++ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { ++ ret = btrfs_next_leaf(root, path); ++ if (ret < 0) ++ return ret; ++ else if (ret > 0) ++ goto copy_inline_extent; ++ } ++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ++ if (key.objectid == btrfs_ino(dst) && ++ key.type == BTRFS_EXTENT_DATA_KEY) { ++ ASSERT(key.offset > 0); ++ return -EOPNOTSUPP; ++ } ++ } else if (i_size_read(dst) <= datal) { ++ struct btrfs_file_extent_item *ei; ++ u64 ext_len; ++ ++ /* ++ * If the file size is <= datal, make sure there are no other ++ * extents following (can happen do to an fallocate call with ++ * the flag FALLOC_FL_KEEP_SIZE). ++ */ ++ ei = btrfs_item_ptr(path->nodes[0], path->slots[0], ++ struct btrfs_file_extent_item); ++ /* ++ * If it's an inline extent, it can not have other extents ++ * following it. 
++ */ ++ if (btrfs_file_extent_type(path->nodes[0], ei) == ++ BTRFS_FILE_EXTENT_INLINE) ++ goto copy_inline_extent; ++ ++ ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei); ++ if (ext_len > aligned_end) ++ return -EOPNOTSUPP; ++ ++ ret = btrfs_next_item(root, path); ++ if (ret < 0) { ++ return ret; ++ } else if (ret == 0) { ++ btrfs_item_key_to_cpu(path->nodes[0], &key, ++ path->slots[0]); ++ if (key.objectid == btrfs_ino(dst) && ++ key.type == BTRFS_EXTENT_DATA_KEY) ++ return -EOPNOTSUPP; ++ } ++ } ++ ++copy_inline_extent: ++ /* ++ * We have no extent items, or we have an extent at offset 0 which may ++ * or may not be inlined. All these cases are dealt the same way. ++ */ ++ if (i_size_read(dst) > datal) { ++ /* ++ * If the destination inode has an inline extent... ++ * This would require copying the data from the source inline ++ * extent into the beginning of the destination's inline extent. ++ * But this is really complex, both extents can be compressed ++ * or just one of them, which would require decompressing and ++ * re-compressing data (which could increase the new compressed ++ * size, not allowing the compressed data to fit anymore in an ++ * inline extent). ++ * So just don't support this case for now (it should be rare, ++ * we are not really saving space when cloning inline extents). 
++ */ ++ return -EOPNOTSUPP; ++ } ++ ++ btrfs_release_path(path); ++ ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1); ++ if (ret) ++ return ret; ++ ret = btrfs_insert_empty_item(trans, root, path, new_key, size); ++ if (ret) ++ return ret; ++ ++ if (skip) { ++ const u32 start = btrfs_file_extent_calc_inline_size(0); ++ ++ memmove(inline_data + start, inline_data + start + skip, datal); ++ } ++ ++ write_extent_buffer(path->nodes[0], inline_data, ++ btrfs_item_ptr_offset(path->nodes[0], ++ path->slots[0]), ++ size); ++ inode_add_bytes(dst, datal); ++ ++ return 0; ++} ++ + /** + * btrfs_clone() - clone a range from inode file to another + * +@@ -3586,21 +3730,6 @@ process_slot: + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; +- u64 aligned_end = 0; +- +- /* +- * Don't copy an inline extent into an offset +- * greater than zero. Having an inline extent +- * at such an offset results in chaos as btrfs +- * isn't prepared for such cases. Just skip +- * this case for the same reasons as commented +- * at btrfs_ioctl_clone(). 
+- */ +- if (last_dest_end > 0) { +- ret = -EOPNOTSUPP; +- btrfs_end_transaction(trans, root); +- goto out; +- } + + if (off > key.offset) { + skip = off - key.offset; +@@ -3618,42 +3747,22 @@ process_slot: + size -= skip + trim; + datal -= skip + trim; + +- aligned_end = ALIGN(new_key.offset + datal, +- root->sectorsize); +- ret = btrfs_drop_extents(trans, root, inode, +- drop_start, +- aligned_end, +- 1); ++ ret = clone_copy_inline_extent(src, inode, ++ trans, path, ++ &new_key, ++ drop_start, ++ datal, ++ skip, size, buf); + if (ret) { + if (ret != -EOPNOTSUPP) + btrfs_abort_transaction(trans, +- root, ret); +- btrfs_end_transaction(trans, root); +- goto out; +- } +- +- ret = btrfs_insert_empty_item(trans, root, path, +- &new_key, size); +- if (ret) { +- btrfs_abort_transaction(trans, root, +- ret); ++ root, ++ ret); + btrfs_end_transaction(trans, root); + goto out; + } +- +- if (skip) { +- u32 start = +- btrfs_file_extent_calc_inline_size(0); +- memmove(buf+start, buf+start+skip, +- datal); +- } +- + leaf = path->nodes[0]; + slot = path->slots[0]; +- write_extent_buffer(leaf, buf, +- btrfs_item_ptr_offset(leaf, slot), +- size); +- inode_add_bytes(inode, datal); + } + + /* If we have an implicit hole (NO_HOLES feature). */ diff --git a/queue-4.2/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch b/queue-4.2/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch new file mode 100644 index 00000000000..b613df92037 --- /dev/null +++ b/queue-4.2/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch @@ -0,0 +1,122 @@ +From 1d512cb77bdbda80f0dd0620a3b260d697fd581d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 9 Nov 2015 00:33:58 +0000 +Subject: Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow + +From: Filipe Manana + +commit 1d512cb77bdbda80f0dd0620a3b260d697fd581d upstream. 
+ +If we are using the NO_HOLES feature, we have a tiny time window when +running delalloc for a nodatacow inode where we can race with a concurrent +link or xattr add operation leading to a BUG_ON. + +This happens because at run_delalloc_nocow() we end up casting a leaf item +of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a +file extent item (struct btrfs_file_extent_item) and then analyse its +extent type field, which won't match any of the expected extent types +(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an +explicit BUG_ON(1). + +The following sequence diagram shows how the race happens when running a +no-cow delalloc range [4K, 8K[ for inode 257 and we have the following +neighbour leafs: + + Leaf X (has N items) Leaf Y + + [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ] + slot N - 2 slot N - 1 slot 0 + + (Note the implicit hole for inode 257 regarding the [0, 8K[ range) + + CPU 1 CPU 2 + + run_delalloc_nocow() + btrfs_lookup_file_extent() + --> searches for a key with value + (257 EXTENT_DATA 4096) in the + fs/subvol tree + --> returns us a path with + path->nodes[0] == leaf X and + path->slots[0] == N + + because path->slots[0] is >= + btrfs_header_nritems(leaf X), it + calls btrfs_next_leaf() + + btrfs_next_leaf() + --> releases the path + + hard link added to our inode, + with key (257 INODE_REF 500) + added to the end of leaf X, + so leaf X now has N + 1 keys + + --> searches for the key + (257 INODE_REF 256), because + it was the last key in leaf X + before it released the path, + with path->keep_locks set to 1 + + --> ends up at leaf X again and + it verifies that the key + (257 INODE_REF 256) is no longer + the last key in the leaf, so it + returns with path->nodes[0] == + leaf X and path->slots[0] == N, + pointing to the new item with + key (257 INODE_REF 500) + + the loop iteration of run_delalloc_nocow() + does not break out of the loop and continues + because the key 
referenced in the path + at path->nodes[0] and path->slots[0] is + for inode 257, its type is < BTRFS_EXTENT_DATA_KEY + and its offset (500) is less than our delalloc + range's end (8192) + + the item pointed by the path, an inode reference item, + is (incorrectly) interpreted as a file extent item and + we get an invalid extent type, leading to the BUG_ON(1): + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + (...) + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + (...) + } else { + BUG_ON(1) + } + +The same can happen if a xattr is added concurrently and ends up having +a key with an offset smaller than the delalloc's range end. + +So fix this by skipping keys with a type smaller than +BTRFS_EXTENT_DATA_KEY. + +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -1294,8 +1294,14 @@ next_slot: + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + +- if (found_key.objectid > ino || +- found_key.type > BTRFS_EXTENT_DATA_KEY || ++ if (found_key.objectid > ino) ++ break; ++ if (WARN_ON_ONCE(found_key.objectid < ino) || ++ found_key.type < BTRFS_EXTENT_DATA_KEY) { ++ path->slots[0]++; ++ goto next_slot; ++ } ++ if (found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + diff --git a/queue-4.2/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch b/queue-4.2/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch new file mode 100644 index 00000000000..3211eec4c23 --- /dev/null +++ b/queue-4.2/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch @@ -0,0 +1,198 @@ +From aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 6 Nov 2015 13:33:33 +0000 +Subject: Btrfs: fix race leading to incorrect item 
deletion when dropping extents + +From: Filipe Manana + +commit aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c upstream. + +While running a stress test I got the following warning triggered: + + [191627.672810] ------------[ cut here ]------------ + [191627.673949] WARNING: CPU: 8 PID: 8447 at fs/btrfs/file.c:779 __btrfs_drop_extents+0x391/0xa50 [btrfs]() + (...) + [191627.701485] Call Trace: + [191627.702037] [] dump_stack+0x4f/0x7b + [191627.702992] [] ? console_unlock+0x356/0x3a2 + [191627.704091] [] warn_slowpath_common+0xa1/0xbb + [191627.705380] [] ? __btrfs_drop_extents+0x391/0xa50 [btrfs] + [191627.706637] [] warn_slowpath_null+0x1a/0x1c + [191627.707789] [] __btrfs_drop_extents+0x391/0xa50 [btrfs] + [191627.709155] [] ? cache_alloc_debugcheck_after.isra.32+0x171/0x1d0 + [191627.712444] [] ? kmemleak_alloc_recursive.constprop.40+0x16/0x18 + [191627.714162] [] insert_reserved_file_extent.constprop.40+0x83/0x24e [btrfs] + [191627.715887] [] ? start_transaction+0x3bb/0x610 [btrfs] + [191627.717287] [] btrfs_finish_ordered_io+0x273/0x4e2 [btrfs] + [191627.728865] [] finish_ordered_fn+0x15/0x17 [btrfs] + [191627.730045] [] normal_work_helper+0x14c/0x32c [btrfs] + [191627.731256] [] btrfs_endio_write_helper+0x12/0x14 [btrfs] + [191627.732661] [] process_one_work+0x24c/0x4ae + [191627.733822] [] worker_thread+0x206/0x2c2 + [191627.734857] [] ? process_scheduled_works+0x2f/0x2f + [191627.736052] [] ? process_scheduled_works+0x2f/0x2f + [191627.737349] [] kthread+0xef/0xf7 + [191627.738267] [] ? time_hardirqs_on+0x15/0x28 + [191627.739330] [] ? __kthread_parkme+0xad/0xad + [191627.741976] [] ret_from_fork+0x42/0x70 + [191627.743080] [] ? __kthread_parkme+0xad/0xad + [191627.744206] ---[ end trace bbfddacb7aaada8d ]--- + + $ cat -n fs/btrfs/file.c + 691 int __btrfs_drop_extents(struct btrfs_trans_handle *trans, + (...) 
+ 758 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + 759 if (key.objectid > ino || + 760 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + 761 break; + 762 + 763 fi = btrfs_item_ptr(leaf, path->slots[0], + 764 struct btrfs_file_extent_item); + 765 extent_type = btrfs_file_extent_type(leaf, fi); + 766 + 767 if (extent_type == BTRFS_FILE_EXTENT_REG || + 768 extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + (...) + 774 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + (...) + 778 } else { + 779 WARN_ON(1); + 780 extent_end = search_start; + 781 } + (...) + +This happened because the item we were processing did not match a file +extent item (its key type != BTRFS_EXTENT_DATA_KEY), and even on this +case we cast the item to a struct btrfs_file_extent_item pointer and +then find a type field value that does not match any of the expected +values (BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]). This scenario happens +due to a tiny time window where a race can happen as exemplified below. +For example, consider the following scenario where we're using the +NO_HOLES feature and we have the following two neighbour leafs: + + Leaf X (has N items) Leaf Y + +[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ] + slot N - 2 slot N - 1 slot 0 + +Our inode 257 has an implicit hole in the range [0, 8K[ (implicit rather +than explicit because NO_HOLES is enabled). 
Now if our inode has an +ordered extent for the range [4K, 8K[ that is finishing, the following +can happen: + + CPU 1 CPU 2 + + btrfs_finish_ordered_io() + insert_reserved_file_extent() + __btrfs_drop_extents() + Searches for the key + (257 EXTENT_DATA 4096) through + btrfs_lookup_file_extent() + + Key not found and we get a path where + path->nodes[0] == leaf X and + path->slots[0] == N + + Because path->slots[0] is >= + btrfs_header_nritems(leaf X), we call + btrfs_next_leaf() + + btrfs_next_leaf() releases the path + + inserts key + (257 INODE_REF 4096) + at the end of leaf X, + leaf X now has N + 1 keys, + and the new key is at + slot N + + btrfs_next_leaf() searches for + key (257 INODE_REF 256), with + path->keep_locks set to 1, + because it was the last key it + saw in leaf X + + finds it in leaf X again and + notices it's no longer the last + key of the leaf, so it returns 0 + with path->nodes[0] == leaf X and + path->slots[0] == N (which is now + < btrfs_header_nritems(leaf X)), + pointing to the new key + (257 INODE_REF 4096) + + __btrfs_drop_extents() casts the + item at path->nodes[0], slot + path->slots[0], to a struct + btrfs_file_extent_item - it does + not skip keys for the target + inode with a type less than + BTRFS_EXTENT_DATA_KEY + (BTRFS_INODE_REF_KEY < BTRFS_EXTENT_DATA_KEY) + + sees a bogus value for the type + field triggering the WARN_ON in + the trace shown above, and sets + extent_end = search_start (4096) + + does the if-then-else logic to + fixup 0 length extent items created + by a past bug from hole punching: + + if (extent_end == key.offset && + extent_end >= search_start) + goto delete_extent_item; + + that evaluates to true and it ends + up deleting the key pointed to by + path->slots[0], (257 INODE_REF 4096), + from leaf X + +The same could happen for example for a xattr that ends up having a key +with an offset value that matches search_start (very unlikely but not +impossible). 
+ +So fix this by ensuring that keys smaller than BTRFS_EXTENT_DATA_KEY are +skipped, never casted to struct btrfs_file_extent_item and never deleted +by accident. Also protect against the unexpected case of getting a key +for a lower inode number by skipping that key and issuing a warning. + +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -756,8 +756,16 @@ next_slot: + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); +- if (key.objectid > ino || +- key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) ++ ++ if (key.objectid > ino) ++ break; ++ if (WARN_ON_ONCE(key.objectid < ino) || ++ key.type < BTRFS_EXTENT_DATA_KEY) { ++ ASSERT(del_nr == 0); ++ path->slots[0]++; ++ goto next_slot; ++ } ++ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], +@@ -776,8 +784,8 @@ next_slot: + btrfs_file_extent_inline_len(leaf, + path->slots[0], fi); + } else { +- WARN_ON(1); +- extent_end = search_start; ++ /* can't happen */ ++ BUG(); + } + + /* diff --git a/queue-4.2/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch b/queue-4.2/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch new file mode 100644 index 00000000000..0a13530cd6c --- /dev/null +++ b/queue-4.2/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch @@ -0,0 +1,92 @@ +From f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 9 Nov 2015 18:06:38 +0000 +Subject: Btrfs: fix race when listing an inode's xattrs + +From: Filipe Manana + +commit f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d upstream. + +When listing a inode's xattrs we have a time window where we race against +a concurrent operation for adding a new hard link for our inode that makes +us not return any xattr to user space. 
In order for this to happen, the +first xattr of our inode needs to be at slot 0 of a leaf and the previous +leaf must still have room for an inode ref (or extref) item, and this can +happen because an inode's listxattrs callback does not lock the inode's +i_mutex (nor does the VFS do it for us), but adding a hard link to an +inode makes the VFS lock the inode's i_mutex before calling the inode's +link callback. + +If we have the following leafs: + + Leaf X (has N items) Leaf Y + + [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 XATTR_ITEM 12345), ... ] + slot N - 2 slot N - 1 slot 0 + +The race illustrated by the following sequence diagram is possible: + + CPU 1 CPU 2 + + btrfs_listxattr() + + searches for key (257 XATTR_ITEM 0) + + gets path with path->nodes[0] == leaf X + and path->slots[0] == N + + because path->slots[0] is >= + btrfs_header_nritems(leaf X), it calls + btrfs_next_leaf() + + btrfs_next_leaf() + releases the path + + adds key (257 INODE_REF 666) + to the end of leaf X (slot N), + and leaf X now has N + 1 items + + searches for the key (257 INODE_REF 256), + with path->keep_locks == 1, because that + is the last key it saw in leaf X before + releasing the path + + ends up at leaf X again and it verifies + that the key (257 INODE_REF 256) is no + longer the last key in leaf X, so it + returns with path->nodes[0] == leaf X + and path->slots[0] == N, pointing to + the new item with key (257 INODE_REF 666) + + btrfs_listxattr's loop iteration sees that + the type of the key pointed to by the path is + different from the type BTRFS_XATTR_ITEM_KEY + and so it breaks the loop and stops looking + for more xattr items + --> the application doesn't get any xattr + listed for our inode + +So fix this by breaking the loop only if the key's type is greater than +BTRFS_XATTR_ITEM_KEY and skip the current key if its type is smaller.
+ +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/xattr.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/xattr.c ++++ b/fs/btrfs/xattr.c +@@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *d + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; +- if (found_key.type != BTRFS_XATTR_ITEM_KEY) ++ if (found_key.type > BTRFS_XATTR_ITEM_KEY) + break; ++ if (found_key.type < BTRFS_XATTR_ITEM_KEY) ++ goto next; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + if (verify_dir_item(root, leaf, di)) diff --git a/queue-4.2/btrfs-fix-regression-when-running-delayed-references.patch b/queue-4.2/btrfs-fix-regression-when-running-delayed-references.patch new file mode 100644 index 00000000000..c652e88380e --- /dev/null +++ b/queue-4.2/btrfs-fix-regression-when-running-delayed-references.patch @@ -0,0 +1,307 @@ +From 2c3cf7d5f6105bb957df125dfce61d4483b8742d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 22 Oct 2015 09:47:34 +0100 +Subject: Btrfs: fix regression when running delayed references +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Filipe Manana + +commit 2c3cf7d5f6105bb957df125dfce61d4483b8742d upstream. + +In the kernel 4.2 merge window we had a refactoring/rework of the delayed +references implementation in order to fix certain problems with qgroups. +However that rework introduced one more regression that leads to the +following trace when running delayed references for metadata: + +[35908.064664] kernel BUG at fs/btrfs/extent-tree.c:1832! 
+[35908.065201] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC +[35908.065201] Modules linked in: dm_flakey dm_mod btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc psmouse i2 +[35908.065201] CPU: 14 PID: 15014 Comm: kworker/u32:9 Tainted: G W 4.3.0-rc5-btrfs-next-17+ #1 +[35908.065201] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014 +[35908.065201] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs] +[35908.065201] task: ffff880114b7d780 ti: ffff88010c4c8000 task.ti: ffff88010c4c8000 +[35908.065201] RIP: 0010:[] [] insert_inline_extent_backref+0x52/0xb1 [btrfs] +[35908.065201] RSP: 0018:ffff88010c4cbb08 EFLAGS: 00010293 +[35908.065201] RAX: 0000000000000000 RBX: ffff88008a661000 RCX: 0000000000000000 +[35908.065201] RDX: ffffffffa04dd58f RSI: 0000000000000001 RDI: 0000000000000000 +[35908.065201] RBP: ffff88010c4cbb40 R08: 0000000000001000 R09: ffff88010c4cb9f8 +[35908.065201] R10: 0000000000000000 R11: 000000000000002c R12: 0000000000000000 +[35908.065201] R13: ffff88020a74c578 R14: 0000000000000000 R15: 0000000000000000 +[35908.065201] FS: 0000000000000000(0000) GS:ffff88023edc0000(0000) knlGS:0000000000000000 +[35908.065201] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b +[35908.065201] CR2: 00000000015e8708 CR3: 0000000102185000 CR4: 00000000000006e0 +[35908.065201] Stack: +[35908.065201] ffff88010c4cbb18 0000000000000f37 ffff88020a74c578 ffff88015a408000 +[35908.065201] ffff880154a44000 0000000000000000 0000000000000005 ffff88010c4cbbd8 +[35908.065201] ffffffffa0492b9a 0000000000000005 0000000000000000 0000000000000000 +[35908.065201] Call Trace: +[35908.065201] [] __btrfs_inc_extent_ref+0x8b/0x208 [btrfs] +[35908.065201] [] ? __btrfs_run_delayed_refs+0x4d4/0xd33 [btrfs] +[35908.065201] [] __btrfs_run_delayed_refs+0xafa/0xd33 [btrfs] +[35908.065201] [] ? 
join_transaction.isra.10+0x25/0x41f [btrfs] +[35908.065201] [] ? join_transaction.isra.10+0xa8/0x41f [btrfs] +[35908.065201] [] btrfs_run_delayed_refs+0x75/0x1dd [btrfs] +[35908.065201] [] delayed_ref_async_start+0x3c/0x7b [btrfs] +[35908.065201] [] normal_work_helper+0x14c/0x32a [btrfs] +[35908.065201] [] btrfs_extent_refs_helper+0x12/0x14 [btrfs] +[35908.065201] [] process_one_work+0x24a/0x4ac +[35908.065201] [] worker_thread+0x206/0x2c2 +[35908.065201] [] ? rescuer_thread+0x2cb/0x2cb +[35908.065201] [] ? rescuer_thread+0x2cb/0x2cb +[35908.065201] [] kthread+0xef/0xf7 +[35908.065201] [] ? kthread_parkme+0x24/0x24 +[35908.065201] [] ret_from_fork+0x3f/0x70 +[35908.065201] [] ? kthread_parkme+0x24/0x24 +[35908.065201] Code: 6a 01 41 56 41 54 ff 75 10 41 51 4d 89 c1 49 89 c8 48 8d 4d d0 e8 f6 f1 ff ff 48 83 c4 28 85 c0 75 2c 49 81 fc ff 00 00 00 77 02 <0f> 0b 4c 8b 45 30 8b 4d 28 45 31 +[35908.065201] RIP [] insert_inline_extent_backref+0x52/0xb1 [btrfs] +[35908.065201] RSP +[35908.310885] ---[ end trace fe4299baf0666457 ]--- + +This happens because the new delayed references code no longer merges +delayed references that have different sequence values. The following +steps are an example sequence leading to this issue: + +1) Transaction N starts, fs_info->tree_mod_seq has value 0; + +2) Extent buffer (btree node) A is allocated, delayed reference Ref1 for + bytenr A is created, with a value of 1 and a seq value of 0; + +3) fs_info->tree_mod_seq is incremented to 1; + +4) Extent buffer A is deleted through btrfs_del_items(), which calls + btrfs_del_leaf(), which in turn calls btrfs_free_tree_block(). The + later returns the metadata extent associated to extent buffer A to + the free space cache (the range is not pinned), because the extent + buffer was created in the current transaction (N) and writeback never + happened for the extent buffer (flag BTRFS_HEADER_FLAG_WRITTEN not set + in the extent buffer). 
+ This creates the delayed reference Ref2 for bytenr A, with a value + of -1 and a seq value of 1; + +5) Delayed reference Ref2 is not merged with Ref1 when we create it, + because they have different sequence numbers (decided at + add_delayed_ref_tail_merge()); + +6) fs_info->tree_mod_seq is incremented to 2; + +7) Some task attempts to allocate a new extent buffer (done at + extent-tree.c:find_free_extent()), but due to heavy fragmentation + and running low on metadata space the clustered allocation fails + and we fall back to unclustered allocation, which finds the + extent at offset A, so a new extent buffer at offset A is allocated. + This creates delayed reference Ref3 for bytenr A, with a value of 1 + and a seq value of 2; + +8) Ref3 is merged with neither Ref2 nor Ref1, again because they + all have different seq values; + +9) We start running the delayed references (__btrfs_run_delayed_refs()); + +10) The delayed Ref1 is the first one being applied, which ends up + creating an inline extent backref in the extent tree; + +11) Next the delayed reference Ref3 is selected for execution, and not + Ref2, because select_delayed_ref() always gives a preference for + positive references (that have an action of BTRFS_ADD_DELAYED_REF); + +12) When running Ref3 we already encounter the inline extent backref + in the extent tree at insert_inline_extent_backref(), which makes + us hit the following BUG_ON: + + BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + + This is always true because owner corresponds to the level of the + extent buffer/btree node in the btree. + +For the scenario described above we hit the BUG_ON because we never merge +references that have different seq values.
+ +We used to do the merging before the 4.2 kernel, more specifically, before +the commits: + + c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.") + c43d160fcd5e ("btrfs: delayed-ref: Cleanup the unneeded functions.") + +This issue became more exposed after the following change that was added +to 4.2 as well: + + cffc3374e567 ("Btrfs: fix order by which delayed references are run") + +Which in turn fixed another regression by the two commits previously +mentioned. + +So fix this by bringing back the delayed reference merge code, with the +proper adaptations so that it operates against the new data structure +(linked list vs old red black tree implementation). + +This issue was hit running fstest btrfs/063 in a loop. Several people have +reported this issue in the mailing list when running on kernels 4.2+. + +Very special thanks to Stéphane Lesimple for helping to debug this issue +and testing this fix on his multi terabyte filesystem (which took more +than one day to balance alone, plus fsck, etc).
+ +Fixes: c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.") +Reported-by: Peter Becker +Reported-by: Stéphane Lesimple +Tested-by: Stéphane Lesimple +Reported-by: Malte Schröder +Reported-by: Derek Dongray +Reported-by: Erkki Seppala +Signed-off-by: Filipe Manana +Reviewed-by: Liu Bo +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/delayed-ref.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/extent-tree.c | 14 ++++++ + 2 files changed, 127 insertions(+) + +--- a/fs/btrfs/delayed-ref.c ++++ b/fs/btrfs/delayed-ref.c +@@ -197,6 +197,119 @@ static inline void drop_delayed_ref(stru + trans->delayed_ref_updates--; + } + ++static bool merge_ref(struct btrfs_trans_handle *trans, ++ struct btrfs_delayed_ref_root *delayed_refs, ++ struct btrfs_delayed_ref_head *head, ++ struct btrfs_delayed_ref_node *ref, ++ u64 seq) ++{ ++ struct btrfs_delayed_ref_node *next; ++ bool done = false; ++ ++ next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, ++ list); ++ while (!done && &next->list != &head->ref_list) { ++ int mod; ++ struct btrfs_delayed_ref_node *next2; ++ ++ next2 = list_next_entry(next, list); ++ ++ if (next == ref) ++ goto next; ++ ++ if (seq && next->seq >= seq) ++ goto next; ++ ++ if (next->type != ref->type || next->no_quota != ref->no_quota) ++ goto next; ++ ++ if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY || ++ ref->type == BTRFS_SHARED_BLOCK_REF_KEY) && ++ comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref), ++ btrfs_delayed_node_to_tree_ref(next), ++ ref->type)) ++ goto next; ++ if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY || ++ ref->type == BTRFS_SHARED_DATA_REF_KEY) && ++ comp_data_refs(btrfs_delayed_node_to_data_ref(ref), ++ btrfs_delayed_node_to_data_ref(next))) ++ goto next; ++ ++ if (ref->action == next->action) { ++ mod = next->ref_mod; ++ } else { ++ if (ref->ref_mod < next->ref_mod) { ++ swap(ref, next); ++ done = true; ++ } ++ mod = -next->ref_mod; ++ } ++ ++ drop_delayed_ref(trans, 
delayed_refs, head, next); ++ ref->ref_mod += mod; ++ if (ref->ref_mod == 0) { ++ drop_delayed_ref(trans, delayed_refs, head, ref); ++ done = true; ++ } else { ++ /* ++ * Can't have multiples of the same ref on a tree block. ++ */ ++ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || ++ ref->type == BTRFS_SHARED_BLOCK_REF_KEY); ++ } ++next: ++ next = next2; ++ } ++ ++ return done; ++} ++ ++void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, ++ struct btrfs_fs_info *fs_info, ++ struct btrfs_delayed_ref_root *delayed_refs, ++ struct btrfs_delayed_ref_head *head) ++{ ++ struct btrfs_delayed_ref_node *ref; ++ u64 seq = 0; ++ ++ assert_spin_locked(&head->lock); ++ ++ if (list_empty(&head->ref_list)) ++ return; ++ ++ /* We don't have too many refs to merge for data. */ ++ if (head->is_data) ++ return; ++ ++ spin_lock(&fs_info->tree_mod_seq_lock); ++ if (!list_empty(&fs_info->tree_mod_seq_list)) { ++ struct seq_list *elem; ++ ++ elem = list_first_entry(&fs_info->tree_mod_seq_list, ++ struct seq_list, list); ++ seq = elem->seq; ++ } ++ spin_unlock(&fs_info->tree_mod_seq_lock); ++ ++ ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node, ++ list); ++ while (&ref->list != &head->ref_list) { ++ if (seq && ref->seq >= seq) ++ goto next; ++ ++ if (merge_ref(trans, delayed_refs, head, ref, seq)) { ++ if (list_empty(&head->ref_list)) ++ break; ++ ref = list_first_entry(&head->ref_list, ++ struct btrfs_delayed_ref_node, ++ list); ++ continue; ++ } ++next: ++ ref = list_next_entry(ref, list); ++ } ++} ++ + int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + u64 seq) +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -2365,7 +2365,21 @@ static noinline int __btrfs_run_delayed_ + } + } + ++ /* ++ * We need to try and merge add/drops of the same ref since we ++ * can run into issues with relocate dropping the implicit ref ++ * and then it being added back again before the drop can ++ * 
finish. If we merged anything we need to re-loop so we can ++ * get a good ref. ++ * Or we can get node references of the same type that weren't ++ * merged when created due to bumps in the tree mod seq, and ++ * we need to merge them to prevent adding an inline extent ++ * backref before dropping it (triggering a BUG_ON at ++ * insert_inline_extent_backref()). ++ */ + spin_lock(&locked_ref->lock); ++ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, ++ locked_ref); + + /* + * locked_ref is the head node, so we have to go one diff --git a/queue-4.2/btrfs-fix-resending-received-snapshot-with-parent.patch b/queue-4.2/btrfs-fix-resending-received-snapshot-with-parent.patch new file mode 100644 index 00000000000..daa05617ff4 --- /dev/null +++ b/queue-4.2/btrfs-fix-resending-received-snapshot-with-parent.patch @@ -0,0 +1,76 @@ +From b96b1db039ebc584d03a9933b279e0d3e704c528 Mon Sep 17 00:00:00 2001 +From: Robin Ruede +Date: Wed, 30 Sep 2015 21:23:33 +0200 +Subject: btrfs: fix resending received snapshot with parent + +From: Robin Ruede + +commit b96b1db039ebc584d03a9933b279e0d3e704c528 upstream. + +This fixes a regression introduced by 37b8d27d between v4.1 and v4.2. + +When a snapshot is received, its received_uuid is set to the original +uuid of the subvolume. When that snapshot is then resent to a third +filesystem, its received_uuid is set to the second uuid +instead of the original one. The same was true for the parent_uuid. +This behaviour was partially changed in 37b8d27d, but in that patch +only the parent_uuid was taken from the real original, +not the uuid itself, causing the search for the parent to fail in +the case below. + +This happens for example when trying to send a series of linked +snapshots (e.g. created by snapper) from the backup file system back +to the original one.
+ +The following commands reproduce the issue in v4.2.1 +(no error in 4.1.6) + + # setup three test file systems + for i in 1 2 3; do + truncate -s 50M fs$i + mkfs.btrfs fs$i + mkdir $i + mount fs$i $i + done + echo "content" > 1/testfile + btrfs su snapshot -r 1/ 1/snap1 + echo "changed content" > 1/testfile + btrfs su snapshot -r 1/ 1/snap2 + + # works fine: + btrfs send 1/snap1 | btrfs receive 2/ + btrfs send -p 1/snap1 1/snap2 | btrfs receive 2/ + + # ERROR: could not find parent subvolume + btrfs send 2/snap1 | btrfs receive 3/ + btrfs send -p 2/snap1 2/snap2 | btrfs receive 3/ + +Signed-off-by: Robin Ruede +Fixes: 37b8d27de5d0 ("Btrfs: use received_uuid of parent during send") +Reviewed-by: Filipe Manana +Tested-by: Ed Tomlinson +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/send.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -2351,8 +2351,14 @@ static int send_subvol_begin(struct send + } + + TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); +- TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, +- sctx->send_root->root_item.uuid); ++ ++ if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid)) ++ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, ++ sctx->send_root->root_item.received_uuid); ++ else ++ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, ++ sctx->send_root->root_item.uuid); ++ + TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, + le64_to_cpu(sctx->send_root->root_item.ctransid)); + if (parent_root) { diff --git a/queue-4.2/btrfs-fix-signed-overflows-in-btrfs_sync_file.patch b/queue-4.2/btrfs-fix-signed-overflows-in-btrfs_sync_file.patch new file mode 100644 index 00000000000..b4eb4f501cb --- /dev/null +++ b/queue-4.2/btrfs-fix-signed-overflows-in-btrfs_sync_file.patch @@ -0,0 +1,66 @@ +From 9dcbeed4d7e11e1dcf5e55475de3754f0855d1c2 Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Mon, 9 Nov 2015 11:44:45 +0100 +Subject: btrfs: fix signed overflows in btrfs_sync_file + +From: David Sterba + 
+commit 9dcbeed4d7e11e1dcf5e55475de3754f0855d1c2 upstream. + +The calculation of range length in btrfs_sync_file leads to signed +overflow. This was caught by PaX gcc SIZE_OVERFLOW plugin. + +https://forums.grsecurity.net/viewtopic.php?f=1&t=4284 + +The fsync call passes 0 and LLONG_MAX, the range length does not fit to +loff_t and overflows, but the value is converted to u64 so it silently +works as expected. + +The minimal fix is a typecast to u64, switching functions to take +(start, end) instead of (start, len) would be more intrusive. + +Coccinelle script found that there's one more opencoded calculation of +the length. + + +@@ +loff_t start, end; +@@ +* end - start + + +Signed-off-by: David Sterba +Signed-off-by: Chris Mason +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -1876,8 +1876,13 @@ int btrfs_sync_file(struct file *file, l + struct btrfs_log_ctx ctx; + int ret = 0; + bool full_sync = 0; +- const u64 len = end - start + 1; ++ u64 len; + ++ /* ++ * The range length can be represented by u64, we have to do the typecasts ++ * to avoid signed overflow if it's [0, LLONG_MAX] eg. 
from fsync() ++ */ ++ len = (u64)end - (u64)start + 1; + trace_btrfs_sync_file(file, datasync); + + /* +@@ -2065,8 +2070,7 @@ int btrfs_sync_file(struct file *file, l + } + } + if (!full_sync) { +- ret = btrfs_wait_ordered_range(inode, start, +- end - start + 1); ++ ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) { + btrfs_end_transaction(trans, root); + goto out; diff --git a/queue-4.2/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch b/queue-4.2/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch new file mode 100644 index 00000000000..d4d7f449ef2 --- /dev/null +++ b/queue-4.2/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch @@ -0,0 +1,288 @@ +From 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 16 Oct 2015 12:34:25 +0100 +Subject: Btrfs: fix truncation of compressed and inlined extents + +From: Filipe Manana + +commit 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 upstream. + +When truncating a file to a smaller size which consists of an inline +extent that is compressed, we did not discard (or made unusable) the +data between the new file size and the old file size, wasting metadata +space and allowing for the truncated data to be leaked and the data +corruption/loss mentioned below. +We were also not correctly decrementing the number of bytes used by the +inode, we were setting it to zero, giving a wrong report for callers of +the stat(2) syscall. The fsck tool also reported an error about a mismatch +between the nbytes of the file versus the real space used by the file. + +Now because we weren't discarding the truncated region of the file, it +was possible for a caller of the clone ioctl to actually read the data +that was truncated, allowing for a security breach without requiring root +access to the system, using only standard filesystem operations. 
The +scenario is the following: + + 1) User A creates a file which consists of an inline and compressed + extent with a size of 2000 bytes - the file is not accessible to + any other users (no read, write or execution permission for anyone + else); + + 2) The user truncates the file to a size of 1000 bytes; + + 3) User A makes the file world readable; + + 4) User B creates a file consisting of an inline extent of 2000 bytes; + + 5) User B issues a clone operation from user A's file into its own + file (using a length argument of 0, clone the whole range); + + 6) User B now gets to see the 1000 bytes that user A truncated from + its file before it made its file world readable. User B also lost + the bytes in the range [1000, 2000[ from its own file, but + that might be ok if his/her intention was reading stale data from + user A that was never supposed to be public. + +Note that this contrasts with the case where we truncate a file from 2000 +bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In +this case reading any byte from the range [1000, 2000[ will return a value +of 0x00, instead of the original data. + +This problem exists since the clone ioctl was added and happens both with +and without my recent data loss and file corruption fixes for the clone +ioctl (patch "Btrfs: fix file corruption and data loss after cloning +inline extents"). + +So fix this by truncating the compressed inline extents as we do for the +non-compressed case, which involves decompressing, if the data isn't already +in the page cache, compressing the truncated version of the extent, writing +the compressed content into the inline extent and then truncate it. + +The following test case for fstests reproduces the problem.
In order for +the test to pass both this fix and my previous fix for the clone ioctl +that forbids cloning a smaller inline extent into a larger one, +which is titled "Btrfs: fix file corruption and data loss after cloning +inline extents", are needed. Without that other fix the test fails in a +different way that does not leak the truncated data, instead part of +destination file gets replaced with zeroes (because the destination file +has a larger inline extent than the source). + + seq=`basename $0` + seqres=$RESULT_DIR/$seq + echo "QA output created by $seq" + tmp=/tmp/$$ + status=1 # failure is the default! + trap "_cleanup; exit \$status" 0 1 2 3 15 + + _cleanup() + { + rm -f $tmp.* + } + + # get standard environment, filters and checks + . ./common/rc + . ./common/filter + + # real QA test starts here + _need_to_be_root + _supported_fs btrfs + _supported_os Linux + _require_scratch + _require_cloner + + rm -f $seqres.full + + _scratch_mkfs >>$seqres.full 2>&1 + _scratch_mount "-o compress" + + # Create our test files. File foo is going to be the source of a clone operation + # and consists of a single inline extent with an uncompressed size of 512 bytes, + # while file bar consists of a single inline extent with an uncompressed size of + # 256 bytes. For our test's purpose, it's important that file bar has an inline + # extent with a size smaller than foo's inline extent. + $XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \ + -c "pwrite -S 0x2a 128 384" \ + $SCRATCH_MNT/foo | _filter_xfs_io + $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io + + # Now durably persist all metadata and data. We do this to make sure that we get + # on disk an inline extent with a size of 512 bytes for file foo. + sync + + # Now truncate our file foo to a smaller size. 
Because it consists of a + # compressed and inline extent, btrfs did not shrink the inline extent to the + # new size (if the extent was not compressed, btrfs would shrink it to 128 + # bytes), it only updates the inode's i_size to 128 bytes. + $XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo + + # Now clone foo's inline extent into bar. + # This clone operation should fail with errno EOPNOTSUPP because the source + # file consists only of an inline extent and the file's size is smaller than + # the inline extent of the destination (128 bytes < 256 bytes). However the + # clone ioctl was not prepared to deal with a file that has a size smaller + # than the size of its inline extent (something that happens only for compressed + # inline extents), resulting in copying the full inline extent from the source + # file into the destination file. + # + # Note that btrfs' clone operation for inline extents consists of removing the + # inline extent from the destination inode and copy the inline extent from the + # source inode into the destination inode, meaning that if the destination + # inode's inline extent is larger (N bytes) than the source inode's inline + # extent (M bytes), some bytes (N - M bytes) will be lost from the destination + # file. Btrfs could copy the source inline extent's data into the destination's + # inline extent so that we would not lose any data, but that's currently not + # done due to the complexity that would be needed to deal with such cases + # (specially when one or both extents are compressed), returning EOPNOTSUPP, as + # it's normally not a very common case to clone very small files (only case + # where we get inline extents) and copying inline extents does not save any + # space (unlike for normal, non-inlined extents). 
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar + + # Now because the above clone operation used to succeed, and due to foo's inline + # extent not being shrunk by the truncate operation, our file bar got the whole + # inline extent copied from foo, making us lose the last 128 bytes from bar + # which got replaced by the bytes in range [128, 256[ from foo before foo was + # truncated - in other words, data loss from bar and being able to read old and + # stale data from foo that should not be possible to read anymore through normal + # filesystem operations. Contrast with the case where we truncate a file from a + # size N to a smaller size M, truncate it back to size N and then read the range + # [M, N[, we should always get the value 0x00 for all the bytes in that range. + + # We expected the clone operation to fail with errno EOPNOTSUPP and therefore + # not modify our file bar's data/metadata. So its content should be 256 bytes + # long with all bytes having the value 0xbb. + # + # Without the btrfs bug fix, the clone operation succeeded and resulted in + # leaking truncated data from foo, the bytes that belonged to its range + # [128, 256[, and losing data from bar in that same range.
So reading the + # file gave us the following content: + # + # 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 + # * + # 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a + # * + # 0000400 + echo "File bar's content after the clone operation:" + od -t x1 $SCRATCH_MNT/bar + + # Also because the foo's inline extent was not shrunk by the truncate + # operation, btrfs' fsck, which is run by the fstests framework everytime a + # test completes, failed reporting the following error: + # + # root 5 inode 257 errors 400, nbytes wrong + + status=0 + exit + +Signed-off-by: Filipe Manana +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 82 +++++++++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 68 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -4184,6 +4184,47 @@ static int truncate_space_check(struct b + + } + ++static int truncate_inline_extent(struct inode *inode, ++ struct btrfs_path *path, ++ struct btrfs_key *found_key, ++ const u64 item_end, ++ const u64 new_size) ++{ ++ struct extent_buffer *leaf = path->nodes[0]; ++ int slot = path->slots[0]; ++ struct btrfs_file_extent_item *fi; ++ u32 size = (u32)(new_size - found_key->offset); ++ struct btrfs_root *root = BTRFS_I(inode)->root; ++ ++ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); ++ ++ if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) { ++ loff_t offset = new_size; ++ loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE); ++ ++ /* ++ * Zero out the remaining of the last page of our inline extent, ++ * instead of directly truncating our inline extent here - that ++ * would be much more complex (decompressing all the data, then ++ * compressing the truncated data, which might be bigger than ++ * the size of the inline extent, resize the extent, etc). ++ * We release the path because to get the page we might need to ++ * read the extent item from disk (data not in the page cache). 
++ */ ++ btrfs_release_path(path); ++ return btrfs_truncate_page(inode, offset, page_end - offset, 0); ++ } ++ ++ btrfs_set_file_extent_ram_bytes(leaf, fi, size); ++ size = btrfs_file_extent_calc_inline_size(size); ++ btrfs_truncate_item(root, path, size, 1); ++ ++ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) ++ inode_sub_bytes(inode, item_end + 1 - new_size); ++ ++ return 0; ++} ++ + /* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find +@@ -4378,27 +4419,40 @@ search_again: + * special encodings + */ + if (!del_item && +- btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { +- u32 size = new_size - found_key.offset; +- +- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) +- inode_sub_bytes(inode, item_end + 1 - +- new_size); + + /* +- * update the ram bytes to properly reflect +- * the new size of our item ++ * Need to release path in order to truncate a ++ * compressed extent. So delete any accumulated ++ * extent items so far. 
+ */ +- btrfs_set_file_extent_ram_bytes(leaf, fi, size); +- size = +- btrfs_file_extent_calc_inline_size(size); +- btrfs_truncate_item(root, path, size, 1); ++ if (btrfs_file_extent_compression(leaf, fi) != ++ BTRFS_COMPRESS_NONE && pending_del_nr) { ++ err = btrfs_del_items(trans, root, path, ++ pending_del_slot, ++ pending_del_nr); ++ if (err) { ++ btrfs_abort_transaction(trans, ++ root, ++ err); ++ goto error; ++ } ++ pending_del_nr = 0; ++ } ++ ++ err = truncate_inline_extent(inode, path, ++ &found_key, ++ item_end, ++ new_size); ++ if (err) { ++ btrfs_abort_transaction(trans, ++ root, err); ++ goto error; ++ } + } else if (test_bit(BTRFS_ROOT_REF_COWS, + &root->state)) { +- inode_sub_bytes(inode, item_end + 1 - +- found_key.offset); ++ inode_sub_bytes(inode, item_end + 1 - new_size); + } + } + delete: diff --git a/queue-4.2/ceph-fix-message-length-computation.patch b/queue-4.2/ceph-fix-message-length-computation.patch new file mode 100644 index 00000000000..920bc6295a1 --- /dev/null +++ b/queue-4.2/ceph-fix-message-length-computation.patch @@ -0,0 +1,37 @@ +From 777d738a5e58ba3b6f3932ab1543ce93703f4873 Mon Sep 17 00:00:00 2001 +From: Arnd Bergmann +Date: Wed, 30 Sep 2015 15:04:42 +0200 +Subject: ceph: fix message length computation + +From: Arnd Bergmann + +commit 777d738a5e58ba3b6f3932ab1543ce93703f4873 upstream. + +create_request_message() computes the maximum length of a message, +but uses the wrong type for the time stamp: sizeof(struct timespec) +may be 8 or 16 depending on the architecture, while sizeof(struct +ceph_timespec) is always 8, and that is what gets put into the +message. + +Found while auditing the uses of timespec for y2038 problems. 
+ +Fixes: b8e69066d8af ("ceph: include time stamp in every MDS request") +Signed-off-by: Arnd Bergmann +Signed-off-by: Yan, Zheng +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ceph/mds_client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ceph/mds_client.c ++++ b/fs/ceph/mds_client.c +@@ -1935,7 +1935,7 @@ static struct ceph_msg *create_request_m + + len = sizeof(*head) + + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + +- sizeof(struct timespec); ++ sizeof(struct ceph_timespec); + + /* calculate (max) length for cap releases */ + len += sizeof(struct ceph_mds_request_release) * diff --git a/queue-4.2/cobalt-fix-kconfig-dependency.patch b/queue-4.2/cobalt-fix-kconfig-dependency.patch new file mode 100644 index 00000000000..bf87144c559 --- /dev/null +++ b/queue-4.2/cobalt-fix-kconfig-dependency.patch @@ -0,0 +1,55 @@ +From fc88dd16a0e430f57458e6bd9b62a631c6ea53a1 Mon Sep 17 00:00:00 2001 +From: Hans Verkuil +Date: Mon, 21 Sep 2015 08:42:04 -0300 +Subject: [media] cobalt: fix Kconfig dependency + +From: Hans Verkuil + +commit fc88dd16a0e430f57458e6bd9b62a631c6ea53a1 upstream. + +The cobalt driver should depend on VIDEO_V4L2_SUBDEV_API. 
+ +This fixes this kbuild error: + +tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master +head: 99bc7215bc60f6cd414cf1b85cd9d52cc596cccb +commit: 85756a069c55e0315ac5990806899cfb607b987f [media] cobalt: add new driver +config: x86_64-randconfig-s0-09201514 (attached as .config) +reproduce: + git checkout 85756a069c55e0315ac5990806899cfb607b987f + # save the attached .config to linux build tree + make ARCH=x86_64 + +All error/warnings (new ones prefixed by >>): + + drivers/media/i2c/adv7604.c: In function 'adv76xx_get_format': +>> drivers/media/i2c/adv7604.c:1853:9: error: implicit declaration of function 'v4l2_subdev_get_try_format' [-Werror=implicit-function-declaration] + fmt = v4l2_subdev_get_try_format(sd, cfg, format->pad); + ^ + drivers/media/i2c/adv7604.c:1853:7: warning: assignment makes pointer from integer without a cast [-Wint-conversion] + fmt = v4l2_subdev_get_try_format(sd, cfg, format->pad); + ^ + drivers/media/i2c/adv7604.c: In function 'adv76xx_set_format': + drivers/media/i2c/adv7604.c:1882:7: warning: assignment makes pointer from integer without a cast [-Wint-conversion] + fmt = v4l2_subdev_get_try_format(sd, cfg, format->pad); + ^ + cc1: some warnings being treated as errors + +Signed-off-by: Hans Verkuil +Signed-off-by: Mauro Carvalho Chehab +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/media/pci/cobalt/Kconfig | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/media/pci/cobalt/Kconfig ++++ b/drivers/media/pci/cobalt/Kconfig +@@ -1,6 +1,6 @@ + config VIDEO_COBALT + tristate "Cisco Cobalt support" +- depends on VIDEO_V4L2 && I2C && MEDIA_CONTROLLER ++ depends on VIDEO_V4L2 && I2C && VIDEO_V4L2_SUBDEV_API + depends on PCI_MSI && MTD_COMPLEX_MAPPINGS && GPIOLIB + depends on SND + select I2C_ALGOBIT diff --git a/queue-4.2/debugfs-fix-refcount-imbalance-in-start_creating.patch b/queue-4.2/debugfs-fix-refcount-imbalance-in-start_creating.patch new file mode 100644 index 00000000000..424f7cde0ea 
--- /dev/null +++ b/queue-4.2/debugfs-fix-refcount-imbalance-in-start_creating.patch @@ -0,0 +1,47 @@ +From 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Thu, 5 Nov 2015 00:01:51 +0100 +Subject: debugfs: fix refcount imbalance in start_creating + +From: Daniel Borkmann + +commit 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 upstream. + +In debugfs' start_creating(), we pin the file system to safely access +its root. When we failed to create a file, we unpin the file system via +failed_creating() to release the mount count and eventually the reference +of the vfsmount. + +However, when we run into an error during lookup_one_len() when still +in start_creating(), we only release the parent's mutex but not so the +reference on the mount. Looks like it was done in the past, but after +splitting portions of __create_file() into start_creating() and +end_creating() via 190afd81e4a5 ("debugfs: split the beginning and the +end of __create_file() off"), this seemed missed. Noticed during code +review. 
+ +Fixes: 190afd81e4a5 ("debugfs: split the beginning and the end of __create_file() off") +Signed-off-by: Daniel Borkmann +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman + +--- + fs/debugfs/inode.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/debugfs/inode.c ++++ b/fs/debugfs/inode.c +@@ -271,8 +271,12 @@ static struct dentry *start_creating(con + dput(dentry); + dentry = ERR_PTR(-EEXIST); + } +- if (IS_ERR(dentry)) ++ ++ if (IS_ERR(dentry)) { + mutex_unlock(&d_inode(parent)->i_mutex); ++ simple_release_fs(&debugfs_mount, &debugfs_mount_count); ++ } ++ + return dentry; + } + diff --git a/queue-4.2/ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch b/queue-4.2/ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch new file mode 100644 index 00000000000..09eb7405fac --- /dev/null +++ b/queue-4.2/ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch @@ -0,0 +1,90 @@ +From 36086d43f6575c081067de9855786a2fc91df77b Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sat, 3 Oct 2015 10:49:29 -0400 +Subject: ext4 crypto: fix bugs in ext4_encrypted_zeroout() + +From: Theodore Ts'o + +commit 36086d43f6575c081067de9855786a2fc91df77b upstream. + +Fix multiple bugs in ext4_encrypted_zeroout(), including one that +could cause us to write an encrypted zero page to the wrong location +on disk, potentially causing data and file system corruption. +Fortunately, this tends to only show up in stress tests, but even with +these fixes, we are seeing some test failures with generic/127 --- but +these are now caused by data failures instead of metadata corruption. + +Since ext4_encrypted_zeroout() is only used for some optimizations to +keep the extent tree from being too fragmented, and +ext4_encrypted_zeroout() itself isn't all that optimized from a time +or IOPS perspective, disable the extent tree optimization for +encrypted inodes for now. 
This prevents the data corruption issues +reported by generic/127 until we can figure out what's going wrong. + +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/crypto.c | 23 +++++++++++++++++++---- + fs/ext4/extents.c | 3 +++ + 2 files changed, 22 insertions(+), 4 deletions(-) + +--- a/fs/ext4/crypto.c ++++ b/fs/ext4/crypto.c +@@ -410,7 +410,13 @@ int ext4_encrypted_zeroout(struct inode + ext4_lblk_t lblk = ex->ee_block; + ext4_fsblk_t pblk = ext4_ext_pblock(ex); + unsigned int len = ext4_ext_get_actual_len(ex); +- int err = 0; ++ int ret, err = 0; ++ ++#if 0 ++ ext4_msg(inode->i_sb, KERN_CRIT, ++ "ext4_encrypted_zeroout ino %lu lblk %u len %u", ++ (unsigned long) inode->i_ino, lblk, len); ++#endif + + BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); + +@@ -436,17 +442,26 @@ int ext4_encrypted_zeroout(struct inode + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; +- bio->bi_iter.bi_sector = pblk; +- err = bio_add_page(bio, ciphertext_page, ++ bio->bi_iter.bi_sector = ++ pblk << (inode->i_sb->s_blocksize_bits - 9); ++ ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); +- if (err) { ++ if (ret != inode->i_sb->s_blocksize) { ++ /* should never happen! 
*/ ++ ext4_msg(inode->i_sb, KERN_ERR, ++ "bio_add_page failed: %d", ret); ++ WARN_ON(1); + bio_put(bio); ++ err = -EIO; + goto errout; + } + err = submit_bio_wait(WRITE, bio); ++ if ((err == 0) && bio->bi_error) ++ err = -EIO; + bio_put(bio); + if (err) + goto errout; ++ lblk++; pblk++; + } + err = 0; + errout: +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3558,6 +3558,9 @@ static int ext4_ext_convert_to_initializ + max_zeroout = sbi->s_extent_max_zeroout_kb >> + (inode->i_sb->s_blocksize_bits - 10); + ++ if (ext4_encrypted_inode(inode)) ++ max_zeroout = 0; ++ + /* If extent is less than s_max_zeroout_kb, zeroout directly */ + if (max_zeroout && (ee_len <= max_zeroout)) { + err = ext4_ext_zeroout(inode, ex); diff --git a/queue-4.2/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch b/queue-4.2/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch new file mode 100644 index 00000000000..025375982fd --- /dev/null +++ b/queue-4.2/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch @@ -0,0 +1,51 @@ +From 937d7b84dca58f2565715f2c8e52f14c3d65fb22 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Fri, 2 Oct 2015 23:54:58 -0400 +Subject: ext4 crypto: fix memory leak in ext4_bio_write_page() + +From: Theodore Ts'o + +commit 937d7b84dca58f2565715f2c8e52f14c3d65fb22 upstream. + +There are times when ext4_bio_write_page() is called even though we +don't actually need to do any I/O. This happens when ext4_writepage() +gets called by the jbd2 commit path when an inode needs to force its +pages written out in order to provide data=ordered guarantees --- and +a page is backed by an unwritten (e.g., uninitialized) block on disk, +or if delayed allocation means the page's backing store hasn't been +allocated yet. In that case, we need to skip the call to +ext4_encrypt_page(), since in addition to wasting CPU, it leads to a +bounce page and an ext4 crypto context getting leaked. 
+ +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/page-io.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -425,6 +425,7 @@ int ext4_bio_write_page(struct ext4_io_s + struct buffer_head *bh, *head; + int ret = 0; + int nr_submitted = 0; ++ int nr_to_submit = 0; + + blocksize = 1 << inode->i_blkbits; + +@@ -477,11 +478,13 @@ int ext4_bio_write_page(struct ext4_io_s + unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + } + set_buffer_async_write(bh); ++ nr_to_submit++; + } while ((bh = bh->b_this_page) != head); + + bh = head = page_buffers(page); + +- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { ++ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && ++ nr_to_submit) { + data_page = ext4_encrypt(inode, page); + if (IS_ERR(data_page)) { + ret = PTR_ERR(data_page); diff --git a/queue-4.2/ext4-crypto-replace-some-bug_on-s-with-error-checks.patch b/queue-4.2/ext4-crypto-replace-some-bug_on-s-with-error-checks.patch new file mode 100644 index 00000000000..1fe06cd5ef2 --- /dev/null +++ b/queue-4.2/ext4-crypto-replace-some-bug_on-s-with-error-checks.patch @@ -0,0 +1,101 @@ +From 687c3c36e754a999a8263745b27965128db4fee5 Mon Sep 17 00:00:00 2001 +From: Theodore Ts'o +Date: Sat, 3 Oct 2015 10:49:27 -0400 +Subject: ext4 crypto: replace some BUG_ON()'s with error checks + +From: Theodore Ts'o + +commit 687c3c36e754a999a8263745b27965128db4fee5 upstream. + +Buggy (or hostile) userspace should not be able to cause the kernel to +crash. 
+ +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/crypto.c | 1 - + fs/ext4/crypto_fname.c | 2 -- + fs/ext4/crypto_key.c | 16 +++++++++++++--- + fs/ext4/crypto_policy.c | 3 ++- + 4 files changed, 15 insertions(+), 7 deletions(-) + +--- a/fs/ext4/crypto.c ++++ b/fs/ext4/crypto.c +@@ -296,7 +296,6 @@ static int ext4_page_crypto(struct ext4_ + else + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { +- BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } +--- a/fs/ext4/crypto_fname.c ++++ b/fs/ext4/crypto_fname.c +@@ -121,7 +121,6 @@ static int ext4_fname_encrypt(struct ino + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { +- BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } +@@ -183,7 +182,6 @@ static int ext4_fname_decrypt(struct ino + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_ablkcipher_decrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { +- BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } +--- a/fs/ext4/crypto_key.c ++++ b/fs/ext4/crypto_key.c +@@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deri + EXT4_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { +- BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } +@@ -208,7 +207,12 @@ retry: + goto out; + } + crypt_info->ci_keyring_key = keyring_key; +- BUG_ON(keyring_key->type != &key_type_logon); ++ if (keyring_key->type != &key_type_logon) { ++ printk_once(KERN_WARNING ++ "ext4: key type must be logon\n"); ++ res = -ENOKEY; ++ goto out; ++ } + ukp = ((struct user_key_payload *)keyring_key->payload.data); + if (ukp->datalen != sizeof(struct ext4_encryption_key)) 
{ + res = -EINVAL; +@@ -217,7 +221,13 @@ retry: + master_key = (struct ext4_encryption_key *)ukp->data; + BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != + EXT4_KEY_DERIVATION_NONCE_SIZE); +- BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); ++ if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) { ++ printk_once(KERN_WARNING ++ "ext4: key size incorrect: %d\n", ++ master_key->size); ++ res = -ENOKEY; ++ goto out; ++ } + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, + raw_key); + got_key: +--- a/fs/ext4/crypto_policy.c ++++ b/fs/ext4/crypto_policy.c +@@ -137,7 +137,8 @@ int ext4_is_child_context_consistent_wit + + if ((parent == NULL) || (child == NULL)) { + pr_err("parent %p child %p\n", parent, child); +- BUG_ON(1); ++ WARN_ON(1); /* Should never happen */ ++ return 0; + } + /* no restrictions if the parent directory is not encrypted */ + if (!ext4_encrypted_inode(parent)) diff --git a/queue-4.2/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch b/queue-4.2/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch new file mode 100644 index 00000000000..a427ddc7a20 --- /dev/null +++ b/queue-4.2/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch @@ -0,0 +1,44 @@ +From 6934da9238da947628be83635e365df41064b09b Mon Sep 17 00:00:00 2001 +From: Lukas Czerner +Date: Sat, 17 Oct 2015 22:57:06 -0400 +Subject: ext4: fix potential use after free in __ext4_journal_stop + +From: Lukas Czerner + +commit 6934da9238da947628be83635e365df41064b09b upstream. + +There is a use-after-free possibility in __ext4_journal_stop() in the +case that we free the handle in the first jbd2_journal_stop() because +we're referencing handle->h_err afterwards. This was introduced in +9705acd63b125dee8b15c705216d7186daea4625 and it is wrong. Fix it by +storing the handle->h_err value beforehand and avoid referencing +potentially freed handle. 
+ +Fixes: 9705acd63b125dee8b15c705216d7186daea4625 +Signed-off-by: Lukas Czerner +Reviewed-by: Andreas Dilger +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/ext4_jbd2.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/ext4/ext4_jbd2.c ++++ b/fs/ext4/ext4_jbd2.c +@@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *wher + return 0; + } + ++ err = handle->h_err; + if (!handle->h_transaction) { +- err = jbd2_journal_stop(handle); +- return handle->h_err ? handle->h_err : err; ++ rc = jbd2_journal_stop(handle); ++ return err ? err : rc; + } + + sb = handle->h_transaction->t_journal->j_private; +- err = handle->h_err; + rc = jbd2_journal_stop(handle); + + if (!err) diff --git a/queue-4.2/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch b/queue-4.2/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch new file mode 100644 index 00000000000..40141e03630 --- /dev/null +++ b/queue-4.2/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch @@ -0,0 +1,104 @@ +From 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 Mon Sep 17 00:00:00 2001 +From: Daeho Jeong +Date: Sun, 18 Oct 2015 17:02:56 -0400 +Subject: ext4, jbd2: ensure entering into panic after recording an error in superblock + +From: Daeho Jeong + +commit 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 upstream. + +If a EXT4 filesystem utilizes JBD2 journaling and an error occurs, the +journaling will be aborted first and the error number will be recorded +into JBD2 superblock and, finally, the system will enter into the +panic state in "errors=panic" option. But, in the rare case, this +sequence is little twisted like the below figure and it will happen +that the system enters into panic state, which means the system reset +in mobile environment, before completion of recording an error in the +journal superblock. 
In this case, e2fsck cannot recognize that the +filesystem failure occurred in the previous run and the corruption +wouldn't be fixed. + +Task A Task B +ext4_handle_error() +-> jbd2_journal_abort() + -> __journal_abort_soft() + -> __jbd2_journal_abort_hard() + | -> journal->j_flags |= JBD2_ABORT; + | + | __ext4_abort() + | -> jbd2_journal_abort() + | | -> __journal_abort_soft() + | | -> if (journal->j_flags & JBD2_ABORT) + | | return; + | -> panic() + | + -> jbd2_journal_update_sb_errno() + +Tested-by: Hobin Woo +Signed-off-by: Daeho Jeong +Signed-off-by: Theodore Ts'o +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ext4/super.c | 12 ++++++++++-- + fs/jbd2/journal.c | 6 +++++- + include/linux/jbd2.h | 1 + + 3 files changed, 16 insertions(+), 3 deletions(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -397,9 +397,13 @@ static void ext4_handle_error(struct sup + smp_wmb(); + sb->s_flags |= MS_RDONLY; + } +- if (test_opt(sb, ERRORS_PANIC)) ++ if (test_opt(sb, ERRORS_PANIC)) { ++ if (EXT4_SB(sb)->s_journal && ++ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) ++ return; + panic("EXT4-fs (device %s): panic forced after error\n", + sb->s_id); ++ } + } + + #define ext4_error_ratelimit(sb) \ +@@ -588,8 +592,12 @@ void __ext4_abort(struct super_block *sb + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } +- if (test_opt(sb, ERRORS_PANIC)) ++ if (test_opt(sb, ERRORS_PANIC)) { ++ if (EXT4_SB(sb)->s_journal && ++ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR)) ++ return; + panic("EXT4-fs panic from previous error\n"); ++ } + } + + void __ext4_msg(struct super_block *sb, +--- a/fs/jbd2/journal.c ++++ b/fs/jbd2/journal.c +@@ -2071,8 +2071,12 @@ static void __journal_abort_soft (journa + + __jbd2_journal_abort_hard(journal); + +- if (errno) ++ if (errno) { + jbd2_journal_update_sb_errno(journal); ++ write_lock(&journal->j_state_lock); ++ journal->j_flags |= JBD2_REC_ERR; ++ write_unlock(&journal->j_state_lock); ++ } + } + + 
/** +--- a/include/linux/jbd2.h ++++ b/include/linux/jbd2.h +@@ -1007,6 +1007,7 @@ struct journal_s + #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file + * data write error in ordered + * mode */ ++#define JBD2_REC_ERR 0x080 /* The errno in the sb has been recorded */ + + /* + * Function declarations for the journaling transaction and buffer diff --git a/queue-4.2/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch b/queue-4.2/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch new file mode 100644 index 00000000000..a034d06b8b0 --- /dev/null +++ b/queue-4.2/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch @@ -0,0 +1,71 @@ +From 100ceb66d5c40cc0c7018e06a9474302470be73c Mon Sep 17 00:00:00 2001 +From: Stefan Richter +Date: Tue, 3 Nov 2015 01:46:21 +0100 +Subject: firewire: ohci: fix JMicron JMB38x IT context discovery + +From: Stefan Richter + +commit 100ceb66d5c40cc0c7018e06a9474302470be73c upstream. + +Reported by Clifford and Craig for JMicron OHCI-1394 + SDHCI combo +controllers: Often or even most of the time, the controller is +initialized with the message "added OHCI v1.10 device as card 0, 4 IR + +0 IT contexts, quirks 0x10". With 0 isochronous transmit DMA contexts +(IT contexts), applications like audio output are impossible. + +However, OHCI-1394 demands that at least 4 IT contexts are implemented +by the link layer controller, and indeed JMicron JMB38x do implement +four of them. Only their IsoXmitIntMask register is unreliable at early +access. + +With my own JMB381 single function controller I found: + - I can reproduce the problem with a lower probability than Craig's. + - If I put a loop around the section which clears and reads + IsoXmitIntMask, then either the first or the second attempt will + return the correct initial mask of 0x0000000f. I never encountered + a case of needing more than a second attempt. 
+ - Consequently, if I put a dummy reg_read(...IsoXmitIntMaskSet) + before the first write, the subsequent read will return the correct + result. + - If I merely ignore a wrong read result and force the known real + result, later isochronous transmit DMA usage works just fine. + +So let's just fix this chip bug up by the latter method. Tested with +JMB381 on kernel 3.13 and 4.3. + +Since OHCI-1394 generally requires 4 IT contexts at a minimum, this +workaround is simply applied whenever the initial read of IsoXmitIntMask +returns 0, regardless of whether it's a JMicron chip or not. I never heard +of this issue together with any other chip though. + +I am not 100% sure that this fix works on the OHCI-1394 part of JMB380 +and JMB388 combo controllers exactly the same as on the JMB381 single- +function controller, but so far I haven't had a chance to let an owner +of a combo chip run a patched kernel. + +Strangely enough, IsoRecvIntMask is always reported correctly, even +though it is probed right before IsoXmitIntMask. 
+ +Reported-by: Clifford Dunn +Reported-by: Craig Moore +Signed-off-by: Stefan Richter +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/firewire/ohci.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/firewire/ohci.c ++++ b/drivers/firewire/ohci.c +@@ -3675,6 +3675,11 @@ static int pci_probe(struct pci_dev *dev + + reg_write(ohci, OHCI1394_IsoXmitIntMaskSet, ~0); + ohci->it_context_support = reg_read(ohci, OHCI1394_IsoXmitIntMaskSet); ++ /* JMicron JMB38x often shows 0 at first read, just ignore it */ ++ if (!ohci->it_context_support) { ++ ohci_notice(ohci, "overriding IsoXmitIntMask\n"); ++ ohci->it_context_support = 0xf; ++ } + reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, ~0); + ohci->it_context_mask = ohci->it_context_support; + ohci->n_it = hweight32(ohci->it_context_mask); diff --git a/queue-4.2/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch b/queue-4.2/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch new file mode 100644 index 00000000000..b53c8038842 --- /dev/null +++ b/queue-4.2/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch @@ -0,0 +1,39 @@ +From c812012f9ca7cf89c9e1a1cd512e6c3b5be04b85 Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Wed, 25 Nov 2015 13:50:11 -0500 +Subject: nfs: if we have no valid attrs, then don't declare the attribute cache valid + +From: Jeff Layton + +commit c812012f9ca7cf89c9e1a1cd512e6c3b5be04b85 upstream. + +If we pass in an empty nfs_fattr struct to nfs_update_inode, it will +(correctly) not update any of the attributes, but it then clears the +NFS_INO_INVALID_ATTR flag, which indicates that the attributes are +up to date. Don't clear the flag if the fattr struct has no valid +attrs to apply. 
+ +Reviewed-by: Steve French +Signed-off-by: Jeff Layton +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/inode.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/nfs/inode.c ++++ b/fs/nfs/inode.c +@@ -1816,7 +1816,11 @@ static int nfs_update_inode(struct inode + if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0) + nfsi->attr_gencount = fattr->gencount; + } +- invalid &= ~NFS_INO_INVALID_ATTR; ++ ++ /* Don't declare attrcache up to date if there were no attrs! */ ++ if (fattr->valid != 0) ++ invalid &= ~NFS_INO_INVALID_ATTR; ++ + /* Don't invalidate the data if we were to blame */ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) + || S_ISLNK(inode->i_mode))) diff --git a/queue-4.2/nfs4-limit-callback-decoding-to-received-bytes.patch b/queue-4.2/nfs4-limit-callback-decoding-to-received-bytes.patch new file mode 100644 index 00000000000..bddb1c87e33 --- /dev/null +++ b/queue-4.2/nfs4-limit-callback-decoding-to-received-bytes.patch @@ -0,0 +1,97 @@ +From 38b7631fbe42e6e247e9fc9879f961b14a687e3b Mon Sep 17 00:00:00 2001 +From: Benjamin Coddington +Date: Fri, 20 Nov 2015 09:55:30 -0500 +Subject: nfs4: limit callback decoding to received bytes + +From: Benjamin Coddington + +commit 38b7631fbe42e6e247e9fc9879f961b14a687e3b upstream. + +A truncated cb_compound request will cause the client to decode null or +data from a previous callback for nfs4.1 backchannel case, or uninitialized +data for the nfs4.0 case. This is because the path through +svc_process_common() advances the request's iov_base and decrements iov_len +without adjusting the overall xdr_buf's len field. That causes +xdr_init_decode() to set up the xdr_stream with an incorrect length in +nfs4_callback_compound(). + +Fixing this for the nfs4.1 backchannel case first requires setting the +correct iov_len and page_len based on the length of received data in the +same manner as the nfs4.0 case. 
+ +Then the request's xdr_buf length can be adjusted for both cases based upon +the remaining iov_len and page_len. + +Signed-off-by: Benjamin Coddington +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/callback_xdr.c | 7 +++++-- + net/sunrpc/backchannel_rqst.c | 8 ++++++++ + net/sunrpc/svc.c | 1 + + 3 files changed, 14 insertions(+), 2 deletions(-) + +--- a/fs/nfs/callback_xdr.c ++++ b/fs/nfs/callback_xdr.c +@@ -76,7 +76,8 @@ static __be32 *read_buf(struct xdr_strea + + p = xdr_inline_decode(xdr, nbytes); + if (unlikely(p == NULL)) +- printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n"); ++ printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed " ++ "or truncated request.\n"); + return p; + } + +@@ -892,6 +893,7 @@ static __be32 nfs4_callback_compound(str + struct cb_compound_hdr_arg hdr_arg = { 0 }; + struct cb_compound_hdr_res hdr_res = { NULL }; + struct xdr_stream xdr_in, xdr_out; ++ struct xdr_buf *rq_arg = &rqstp->rq_arg; + __be32 *p, status; + struct cb_process_state cps = { + .drc_status = 0, +@@ -903,7 +905,8 @@ static __be32 nfs4_callback_compound(str + + dprintk("%s: start\n", __func__); + +- xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); ++ rq_arg->len = rq_arg->head[0].iov_len + rq_arg->page_len; ++ xdr_init_decode(&xdr_in, rq_arg, rq_arg->head[0].iov_base); + + p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); + xdr_init_encode(&xdr_out, &rqstp->rq_res, p); +--- a/net/sunrpc/backchannel_rqst.c ++++ b/net/sunrpc/backchannel_rqst.c +@@ -333,12 +333,20 @@ void xprt_complete_bc_request(struct rpc + { + struct rpc_xprt *xprt = req->rq_xprt; + struct svc_serv *bc_serv = xprt->bc_serv; ++ struct xdr_buf *rq_rcv_buf = &req->rq_rcv_buf; + + spin_lock(&xprt->bc_pa_lock); + list_del(&req->rq_bc_pa_list); + xprt_dec_alloc_count(xprt, 1); + spin_unlock(&xprt->bc_pa_lock); + ++ if (copied <= rq_rcv_buf->head[0].iov_len) { ++ 
rq_rcv_buf->head[0].iov_len = copied; ++ rq_rcv_buf->page_len = 0; ++ } else { ++ rq_rcv_buf->page_len = copied - rq_rcv_buf->head[0].iov_len; ++ } ++ + req->rq_private_buf.len = copied; + set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); + +--- a/net/sunrpc/svc.c ++++ b/net/sunrpc/svc.c +@@ -1366,6 +1366,7 @@ bc_svc_process(struct svc_serv *serv, st + memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen); + memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg)); + memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res)); ++ rqstp->rq_arg.len = req->rq_private_buf.len; + + /* reset result send buffer "put" position */ + resv->iov_len = 0; diff --git a/queue-4.2/nfs4-start-callback_ident-at-idr-1.patch b/queue-4.2/nfs4-start-callback_ident-at-idr-1.patch new file mode 100644 index 00000000000..c9ef3180ecd --- /dev/null +++ b/queue-4.2/nfs4-start-callback_ident-at-idr-1.patch @@ -0,0 +1,33 @@ +From c68a027c05709330fe5b2f50c50d5fa02124b5d8 Mon Sep 17 00:00:00 2001 +From: Benjamin Coddington +Date: Fri, 20 Nov 2015 09:56:20 -0500 +Subject: nfs4: start callback_ident at idr 1 + +From: Benjamin Coddington + +commit c68a027c05709330fe5b2f50c50d5fa02124b5d8 upstream. + +If clp->cl_cb_ident is zero, then nfs_cb_idr_remove_locked() skips removing +it when the nfs_client is freed. A decoding or server bug can then find +and try to put that first nfs_client which would lead to a crash. 
+ +Signed-off-by: Benjamin Coddington +Fixes: d6870312659d ("nfs4client: convert to idr_alloc()") +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfs/nfs4client.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/nfs/nfs4client.c ++++ b/fs/nfs/nfs4client.c +@@ -33,7 +33,7 @@ static int nfs_get_cb_ident_idr(struct n + return ret; + idr_preload(GFP_KERNEL); + spin_lock(&nn->nfs_client_lock); +- ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT); ++ ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT); + if (ret >= 0) + clp->cl_cb_ident = ret; + spin_unlock(&nn->nfs_client_lock); diff --git a/queue-4.2/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch b/queue-4.2/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch new file mode 100644 index 00000000000..3bd87c7db13 --- /dev/null +++ b/queue-4.2/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch @@ -0,0 +1,200 @@ +From 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 Mon Sep 17 00:00:00 2001 +From: Andrew Elble +Date: Thu, 15 Oct 2015 12:07:28 -0400 +Subject: nfsd: eliminate sending duplicate and repeated delegations + +From: Andrew Elble + +commit 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 upstream. + +We've observed the nfsd server in a state where there are +multiple delegations on the same nfs4_file for the same client. +The nfs client does attempt to DELEGRETURN these when they are presented to +it - but apparently under some (unknown) circumstances the client does not +manage to return all of them. This leads to the eventual +attempt to CB_RECALL more than one delegation with the same nfs +filehandle to the same client. The first recall will succeed, but the +next recall will fail with NFS4ERR_BADHANDLE. This leads to the server +having delegations on cl_revoked that the client has no way to FREE +or DELEGRETURN, with resulting inability to recover. 
The state manager +on the server will continually assert SEQ4_STATUS_RECALLABLE_STATE_REVOKED, +and the state manager on the client will be looping unable to satisfy +the server. + +List discussion also reports a race between OPEN and DELEGRETURN that +will be avoided by only sending the delegation once to the +client. This is also logically in accordance with RFC5661 9.1.1 and 10.2. + +So, let's: + +1.) Not hand out duplicate delegations. +2.) Only send them to the client once. + +RFC 5661: + +9.1.1: +"Delegations and layouts, on the other hand, are not associated with a +specific owner but are associated with the client as a whole +(identified by a client ID)." + +10.2: +"...the stateid for a delegation is associated with a client ID and may be +used on behalf of all the open-owners for the given client. A +delegation is made to the client as a whole and not to any specific +process or thread of control within it." + +Reported-by: Eric Meddaugh +Cc: Trond Myklebust +Cc: Olga Kornievskaia +Signed-off-by: Andrew Elble +Signed-off-by: J. Bruce Fields +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfsd/nfs4state.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 84 insertions(+), 10 deletions(-) + +--- a/fs/nfsd/nfs4state.c ++++ b/fs/nfsd/nfs4state.c +@@ -765,16 +765,68 @@ void nfs4_unhash_stid(struct nfs4_stid * + s->sc_type = 0; + } + +-static void ++/** ++ * nfs4_get_existing_delegation - Discover if this delegation already exists ++ * @clp: a pointer to the nfs4_client we're granting a delegation to ++ * @fp: a pointer to the nfs4_file we're granting a delegation on ++ * ++ * Return: ++ * On success: NULL if an existing delegation was not found. ++ * ++ * On error: -EAGAIN if one was previously granted to this nfs4_client ++ * for this nfs4_file. 
++ * ++ */ ++ ++static int ++nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp) ++{ ++ struct nfs4_delegation *searchdp = NULL; ++ struct nfs4_client *searchclp = NULL; ++ ++ lockdep_assert_held(&state_lock); ++ lockdep_assert_held(&fp->fi_lock); ++ ++ list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) { ++ searchclp = searchdp->dl_stid.sc_client; ++ if (clp == searchclp) { ++ return -EAGAIN; ++ } ++ } ++ return 0; ++} ++ ++/** ++ * hash_delegation_locked - Add a delegation to the appropriate lists ++ * @dp: a pointer to the nfs4_delegation we are adding. ++ * @fp: a pointer to the nfs4_file we're granting a delegation on ++ * ++ * Return: ++ * On success: NULL if the delegation was successfully hashed. ++ * ++ * On error: -EAGAIN if one was previously granted to this ++ * nfs4_client for this nfs4_file. Delegation is not hashed. ++ * ++ */ ++ ++static int + hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) + { ++ int status; ++ struct nfs4_client *clp = dp->dl_stid.sc_client; ++ + lockdep_assert_held(&state_lock); + lockdep_assert_held(&fp->fi_lock); + ++ status = nfs4_get_existing_delegation(clp, fp); ++ if (status) ++ return status; ++ ++fp->fi_delegees; + atomic_inc(&dp->dl_stid.sc_count); + dp->dl_stid.sc_type = NFS4_DELEG_STID; + list_add(&dp->dl_perfile, &fp->fi_delegations); +- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations); ++ list_add(&dp->dl_perclnt, &clp->cl_delegations); ++ return 0; + } + + static bool +@@ -3940,6 +3992,18 @@ static struct file_lock *nfs4_alloc_init + return fl; + } + ++/** ++ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer ++ * @dp: a pointer to the nfs4_delegation we're adding. ++ * ++ * Return: ++ * On success: Return code will be 0 on success. ++ * ++ * On error: -EAGAIN if there was an existing delegation. ++ * nonzero if there is an error in other cases. 
++ * ++ */ ++ + static int nfs4_setlease(struct nfs4_delegation *dp) + { + struct nfs4_file *fp = dp->dl_stid.sc_file; +@@ -3971,16 +4035,19 @@ static int nfs4_setlease(struct nfs4_del + goto out_unlock; + /* Race breaker */ + if (fp->fi_deleg_file) { +- status = 0; +- ++fp->fi_delegees; +- hash_delegation_locked(dp, fp); ++ status = hash_delegation_locked(dp, fp); + goto out_unlock; + } + fp->fi_deleg_file = filp; +- fp->fi_delegees = 1; +- hash_delegation_locked(dp, fp); ++ fp->fi_delegees = 0; ++ status = hash_delegation_locked(dp, fp); + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); ++ if (status) { ++ /* Should never happen, this is a new fi_deleg_file */ ++ WARN_ON_ONCE(1); ++ goto out_fput; ++ } + return 0; + out_unlock: + spin_unlock(&fp->fi_lock); +@@ -4000,6 +4067,15 @@ nfs4_set_delegation(struct nfs4_client * + if (fp->fi_had_conflict) + return ERR_PTR(-EAGAIN); + ++ spin_lock(&state_lock); ++ spin_lock(&fp->fi_lock); ++ status = nfs4_get_existing_delegation(clp, fp); ++ spin_unlock(&fp->fi_lock); ++ spin_unlock(&state_lock); ++ ++ if (status) ++ return ERR_PTR(status); ++ + dp = alloc_init_deleg(clp, fh, odstate); + if (!dp) + return ERR_PTR(-ENOMEM); +@@ -4018,9 +4094,7 @@ nfs4_set_delegation(struct nfs4_client * + status = -EAGAIN; + goto out_unlock; + } +- ++fp->fi_delegees; +- hash_delegation_locked(dp, fp); +- status = 0; ++ status = hash_delegation_locked(dp, fp); + out_unlock: + spin_unlock(&fp->fi_lock); + spin_unlock(&state_lock); diff --git a/queue-4.2/nfsd-serialize-state-seqid-morphing-operations.patch b/queue-4.2/nfsd-serialize-state-seqid-morphing-operations.patch new file mode 100644 index 00000000000..06fea68deca --- /dev/null +++ b/queue-4.2/nfsd-serialize-state-seqid-morphing-operations.patch @@ -0,0 +1,207 @@ +From 35a92fe8770ce54c5eb275cd76128645bea2d200 Mon Sep 17 00:00:00 2001 +From: Jeff Layton +Date: Thu, 17 Sep 2015 07:47:08 -0400 +Subject: nfsd: serialize state seqid morphing operations + +From: Jeff Layton + +commit 
35a92fe8770ce54c5eb275cd76128645bea2d200 upstream. + +Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were +running in parallel. The server would receive the OPEN_DOWNGRADE first +and check its seqid, but then an OPEN would race in and bump it. The +OPEN_DOWNGRADE would then complete and bump the seqid again. The result +was that the OPEN_DOWNGRADE would be applied after the OPEN, even though +it should have been rejected since the seqid changed. + +The only recourse we have here I think is to serialize operations that +bump the seqid in a stateid, particularly when we're given a seqid in +the call. To address this, we add a new rw_semaphore to the +nfs4_ol_stateid struct. We do a down_write prior to checking the seqid +after looking up the stateid to ensure that nothing else is going to +bump it while we're operating on it. + +In the case of OPEN, we do a down_read, as the call doesn't contain a +seqid. Those can run in parallel -- we just need to serialize them when +there is a concurrent OPEN_DOWNGRADE or CLOSE. + +LOCK and LOCKU however always take the write lock as there is no +opportunity for parallelizing those. + +Reported-and-Tested-by: Andrew W Elble +Signed-off-by: Jeff Layton +Signed-off-by: J. 
Bruce Fields +Signed-off-by: Greg Kroah-Hartman + +--- + fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++----- + fs/nfsd/state.h | 19 ++++++++++--------- + 2 files changed, 38 insertions(+), 14 deletions(-) + +--- a/fs/nfsd/nfs4state.c ++++ b/fs/nfsd/nfs4state.c +@@ -3351,6 +3351,7 @@ static void init_open_stateid(struct nfs + stp->st_access_bmap = 0; + stp->st_deny_bmap = 0; + stp->st_openstp = NULL; ++ init_rwsem(&stp->st_rwsem); + spin_lock(&oo->oo_owner.so_client->cl_lock); + list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids); + spin_lock(&fp->fi_lock); +@@ -4180,15 +4181,20 @@ nfsd4_process_open2(struct svc_rqst *rqs + */ + if (stp) { + /* Stateid was found, this is an OPEN upgrade */ ++ down_read(&stp->st_rwsem); + status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); +- if (status) ++ if (status) { ++ up_read(&stp->st_rwsem); + goto out; ++ } + } else { + stp = open->op_stp; + open->op_stp = NULL; + init_open_stateid(stp, fp, open); ++ down_read(&stp->st_rwsem); + status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); + if (status) { ++ up_read(&stp->st_rwsem); + release_open_stateid(stp); + goto out; + } +@@ -4200,6 +4206,7 @@ nfsd4_process_open2(struct svc_rqst *rqs + } + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); ++ up_read(&stp->st_rwsem); + + if (nfsd4_has_session(&resp->cstate)) { + if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) { +@@ -4814,10 +4821,13 @@ static __be32 nfs4_seqid_op_checks(struc + * revoked delegations are kept only for free_stateid. 
+ */ + return nfserr_bad_stateid; ++ down_write(&stp->st_rwsem); + status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); +- if (status) +- return status; +- return nfs4_check_fh(current_fh, &stp->st_stid); ++ if (status == nfs_ok) ++ status = nfs4_check_fh(current_fh, &stp->st_stid); ++ if (status != nfs_ok) ++ up_write(&stp->st_rwsem); ++ return status; + } + + /* +@@ -4864,6 +4874,7 @@ static __be32 nfs4_preprocess_confirmed_ + return status; + oo = openowner(stp->st_stateowner); + if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) { ++ up_write(&stp->st_rwsem); + nfs4_put_stid(&stp->st_stid); + return nfserr_bad_stateid; + } +@@ -4894,11 +4905,14 @@ nfsd4_open_confirm(struct svc_rqst *rqst + goto out; + oo = openowner(stp->st_stateowner); + status = nfserr_bad_stateid; +- if (oo->oo_flags & NFS4_OO_CONFIRMED) ++ if (oo->oo_flags & NFS4_OO_CONFIRMED) { ++ up_write(&stp->st_rwsem); + goto put_stateid; ++ } + oo->oo_flags |= NFS4_OO_CONFIRMED; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); ++ up_write(&stp->st_rwsem); + dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n", + __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid)); + +@@ -4977,6 +4991,7 @@ nfsd4_open_downgrade(struct svc_rqst *rq + memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); + status = nfs_ok; + put_stateid: ++ up_write(&stp->st_rwsem); + nfs4_put_stid(&stp->st_stid); + out: + nfsd4_bump_seqid(cstate, status); +@@ -5030,6 +5045,7 @@ nfsd4_close(struct svc_rqst *rqstp, stru + goto out; + update_stateid(&stp->st_stid.sc_stateid); + memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); ++ up_write(&stp->st_rwsem); + + nfsd4_close_open_stateid(stp); + +@@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid + stp->st_access_bmap = 0; + stp->st_deny_bmap = open_stp->st_deny_bmap; + stp->st_openstp = open_stp; ++ 
init_rwsem(&stp->st_rwsem); + list_add(&stp->st_locks, &open_stp->st_locks); + list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); + spin_lock(&fp->fi_lock); +@@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struc + &open_stp, nn); + if (status) + goto out; ++ up_write(&open_stp->st_rwsem); + open_sop = openowner(open_stp->st_stateowner); + status = nfserr_bad_stateid; + if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid, +@@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struc + goto out; + status = lookup_or_create_lock_state(cstate, open_stp, lock, + &lock_stp, &new); ++ if (status == nfs_ok) ++ down_write(&lock_stp->st_rwsem); + } else { + status = nfs4_preprocess_seqid_op(cstate, + lock->lk_old_lock_seqid, +@@ -5540,6 +5560,8 @@ out: + seqid_mutating_err(ntohl(status))) + lock_sop->lo_owner.so_seqid++; + ++ up_write(&lock_stp->st_rwsem); ++ + /* + * If this is a new, never-before-used stateid, and we are + * returning an error, then just go ahead and release it. +@@ -5710,6 +5732,7 @@ nfsd4_locku(struct svc_rqst *rqstp, stru + fput: + fput(filp); + put_stateid: ++ up_write(&stp->st_rwsem); + nfs4_put_stid(&stp->st_stid); + out: + nfsd4_bump_seqid(cstate, status); +--- a/fs/nfsd/state.h ++++ b/fs/nfsd/state.h +@@ -534,15 +534,16 @@ struct nfs4_file { + * Better suggestions welcome. 
+ */ + struct nfs4_ol_stateid { +- struct nfs4_stid st_stid; /* must be first field */ +- struct list_head st_perfile; +- struct list_head st_perstateowner; +- struct list_head st_locks; +- struct nfs4_stateowner * st_stateowner; +- struct nfs4_clnt_odstate * st_clnt_odstate; +- unsigned char st_access_bmap; +- unsigned char st_deny_bmap; +- struct nfs4_ol_stateid * st_openstp; ++ struct nfs4_stid st_stid; ++ struct list_head st_perfile; ++ struct list_head st_perstateowner; ++ struct list_head st_locks; ++ struct nfs4_stateowner *st_stateowner; ++ struct nfs4_clnt_odstate *st_clnt_odstate; ++ unsigned char st_access_bmap; ++ unsigned char st_deny_bmap; ++ struct nfs4_ol_stateid *st_openstp; ++ struct rw_semaphore st_rwsem; + }; + + static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) diff --git a/queue-4.2/ocfs2-fix-umask-ignored-issue.patch b/queue-4.2/ocfs2-fix-umask-ignored-issue.patch new file mode 100644 index 00000000000..e7564e8b475 --- /dev/null +++ b/queue-4.2/ocfs2-fix-umask-ignored-issue.patch @@ -0,0 +1,36 @@ +From 8f1eb48758aacf6c1ffce18179295adbf3bd7640 Mon Sep 17 00:00:00 2001 +From: Junxiao Bi +Date: Fri, 20 Nov 2015 15:57:30 -0800 +Subject: ocfs2: fix umask ignored issue + +From: Junxiao Bi + +commit 8f1eb48758aacf6c1ffce18179295adbf3bd7640 upstream. + +New created file's mode is not masked with umask, and this makes umask not +work for ocfs2 volume. + +Fixes: 702e5bc ("ocfs2: use generic posix ACL infrastructure") +Signed-off-by: Junxiao Bi +Cc: Gang He +Cc: Mark Fasheh +Cc: Joel Becker +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + fs/ocfs2/namei.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/ocfs2/namei.c ++++ b/fs/ocfs2/namei.c +@@ -365,6 +365,8 @@ static int ocfs2_mknod(struct inode *dir + mlog_errno(status); + goto leave; + } ++ /* update inode->i_mode after mask with "umask". 
*/ ++ inode->i_mode = mode; + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb, + S_ISDIR(mode), diff --git a/queue-4.2/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch b/queue-4.2/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch new file mode 100644 index 00000000000..f13be700dd9 --- /dev/null +++ b/queue-4.2/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch @@ -0,0 +1,34 @@ +From 70b16db86f564977df074072143284aec2cb1162 Mon Sep 17 00:00:00 2001 +From: Ilya Dryomov +Date: Fri, 27 Nov 2015 19:23:24 +0100 +Subject: rbd: don't put snap_context twice in rbd_queue_workfn() + +From: Ilya Dryomov + +commit 70b16db86f564977df074072143284aec2cb1162 upstream. + +Commit 4e752f0ab0e8 ("rbd: access snapshot context and mapping size +safely") moved ceph_get_snap_context() out of rbd_img_request_create() +and into rbd_queue_workfn(), adding a ceph_put_snap_context() to the +error path in rbd_queue_workfn(). However, rbd_img_request_create() +consumes a ref on snapc, so calling ceph_put_snap_context() after +a successful rbd_img_request_create() leads to an extra put. Fix it. 
+ +Signed-off-by: Ilya Dryomov +Reviewed-by: Josh Durgin +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/block/rbd.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/block/rbd.c ++++ b/drivers/block/rbd.c +@@ -3439,6 +3439,7 @@ static void rbd_queue_workfn(struct work + goto err_rq; + } + img_request->rq = rq; ++ snapc = NULL; /* img_request consumes a ref */ + + if (op_type == OBJ_OP_DISCARD) + result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, diff --git a/queue-4.2/series b/queue-4.2/series index 19bb6925004..3dab9360e36 100644 --- a/queue-4.2/series +++ b/queue-4.2/series @@ -35,3 +35,29 @@ net-neighbour-fix-crash-at-dumping-device-agnostic-proxy-entries.patch ipv6-sctp-implement-sctp_v6_destroy_sock.patch net_sched-fix-qdisc_tree_decrease_qlen-races.patch via-rhine-fix-vlan-receive-handling-regression.patch +btrfs-fix-resending-received-snapshot-with-parent.patch +btrfs-check-unsupported-filters-in-balance-arguments.patch +btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch +btrfs-fix-truncation-of-compressed-and-inlined-extents.patch +btrfs-fix-regression-when-running-delayed-references.patch +btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch +btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch +btrfs-fix-race-when-listing-an-inode-s-xattrs.patch +btrfs-fix-signed-overflows-in-btrfs_sync_file.patch +rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch +ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch +ext4-crypto-replace-some-bug_on-s-with-error-checks.patch +ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch +ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch +ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch +firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch +nfsd-serialize-state-seqid-morphing-operations.patch +nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch 
+debugfs-fix-refcount-imbalance-in-start_creating.patch +nfs4-limit-callback-decoding-to-received-bytes.patch +nfs4-start-callback_ident-at-idr-1.patch +nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch +ocfs2-fix-umask-ignored-issue.patch +ceph-fix-message-length-computation.patch +alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch +cobalt-fix-kconfig-dependency.patch -- 2.47.3