--- /dev/null
+From e2656412f2a7343ecfd13eb74bac0a6e6e9c5aad Mon Sep 17 00:00:00 2001
+From: "Lu, Han" <han.lu@intel.com>
+Date: Wed, 11 Nov 2015 16:54:27 +0800
+Subject: ALSA: hda/hdmi - apply Skylake fix-ups to Broxton display codec
+
+From: "Lu, Han" <han.lu@intel.com>
+
+commit e2656412f2a7343ecfd13eb74bac0a6e6e9c5aad upstream.
+
+Broxton and Skylake have the same behavior on display audio. So this patch
+applies Skylake fix-ups to Broxton.
+
+Signed-off-by: Lu, Han <han.lu@intel.com>
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ sound/pci/hda/patch_hdmi.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/sound/pci/hda/patch_hdmi.c
++++ b/sound/pci/hda/patch_hdmi.c
+@@ -48,8 +48,9 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't
+ #define is_haswell(codec) ((codec)->core.vendor_id == 0x80862807)
+ #define is_broadwell(codec) ((codec)->core.vendor_id == 0x80862808)
+ #define is_skylake(codec) ((codec)->core.vendor_id == 0x80862809)
++#define is_broxton(codec) ((codec)->core.vendor_id == 0x8086280a)
+ #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \
+- || is_skylake(codec))
++ || is_skylake(codec) || is_broxton(codec))
+
+ #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882)
+ #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883)
--- /dev/null
+From 849ef9286f30c88113906dc35f44a499c0cb385d Mon Sep 17 00:00:00 2001
+From: David Sterba <dsterba@suse.com>
+Date: Mon, 12 Oct 2015 16:55:54 +0200
+Subject: btrfs: check unsupported filters in balance arguments
+
+From: David Sterba <dsterba@suse.com>
+
+commit 849ef9286f30c88113906dc35f44a499c0cb385d upstream.
+
+We don't verify that all the balance filter arguments indicated by the
+flags are actually known to the kernel, so unknown filters silently pass
+through and do nothing.
+
+At the moment this means only the 'limit' filter, but we're going to add
+a few more soon, so it's better to have that fixed. This should also go to
+older stable kernels so that they behave correctly with newer userspace tools.
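+
+As an illustration only (not part of the patch), a minimal user-space sketch
+of the new behaviour, assuming the BTRFS_IOC_BALANCE_V2 ioctl and struct
+btrfs_ioctl_balance_args from the uapi header <linux/btrfs.h>; the unknown
+filter bit used below is hypothetical:
+
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <string.h>
+  #include <sys/ioctl.h>
+  #include <linux/btrfs.h>
+
+  int main(int argc, char **argv)
+  {
+          struct btrfs_ioctl_balance_args args;
+          int fd = open(argc > 1 ? argv[1] : "/mnt", O_RDONLY);
+
+          if (fd < 0)
+                  return 1;
+          memset(&args, 0, sizeof(args));
+          /* hypothetical bit that no balance flag or filter defines */
+          args.flags = 1ULL << 40;
+          if (ioctl(fd, BTRFS_IOC_BALANCE_V2, &args) < 0)
+                  perror("balance"); /* EINVAL with this fix; previously the unknown bit was silently ignored */
+          return 0;
+  }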
+
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c | 5 +++++
+ fs/btrfs/volumes.h | 8 ++++++++
+ 2 files changed, 13 insertions(+)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -4497,6 +4497,11 @@ locked:
+ goto out_bctl;
+ }
+
++ if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
++ ret = -EINVAL;
++ goto out_bargs;
++ }
++
+ do_balance:
+ /*
+ * Ownership of bctl and mutually_exclusive_operation_running
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -380,6 +380,14 @@ struct map_lookup {
+ BTRFS_BALANCE_ARGS_VRANGE | \
+ BTRFS_BALANCE_ARGS_LIMIT)
+
++#define BTRFS_BALANCE_ARGS_MASK \
++ (BTRFS_BALANCE_ARGS_PROFILES | \
++ BTRFS_BALANCE_ARGS_USAGE | \
++ BTRFS_BALANCE_ARGS_DEVID | \
++ BTRFS_BALANCE_ARGS_DRANGE | \
++ BTRFS_BALANCE_ARGS_VRANGE | \
++ BTRFS_BALANCE_ARGS_LIMIT)
++
+ /*
+ * Profile changing flags. When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
--- /dev/null
+From 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 13 Oct 2015 15:15:00 +0100
+Subject: Btrfs: fix file corruption and data loss after cloning inline extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 upstream.
+
+Currently the clone ioctl allows cloning an inline extent from one file
+to another that already has other (non-inlined) extents. This is a problem
+because btrfs is not designed to deal with files having both inline and
+regular extents: if a file has an inline extent, then it must be the only
+extent in the file and must start at file offset 0. Having a file with an
+inline extent followed by regular extents results in EIO errors when doing
+reads or writes against the first 4K of the file.
+
+Also, the clone ioctl allows one to lose data if the source file consists
+of a single inline extent, with a size of N bytes, and the destination
+file consists of a single inline extent with a size of M bytes, where we
+have M > N. In this case the clone operation removes the inline extent
+from the destination file and then copies the inline extent from the
+source file into the destination file - we lose the M - N bytes from the
+destination file, a read operation will get the value 0x00 for any bytes
+in the range [N, M[ (the destination inode's i_size remained as M,
+that's why we can read past N bytes).
+
+So fix this by not allowing such destructive operations to happen and
+return errno EOPNOTSUPP to user space.
+
+Currently the fstest btrfs/035 tests the data loss case but it totally
+ignores this - i.e. it expects the operation to succeed and does not check
+that we got data loss.
+
+The following test case for fstests exercises all these cases that result
+in file corruption and data loss:
+
+ seq=`basename $0`
+ seqres=$RESULT_DIR/$seq
+ echo "QA output created by $seq"
+ tmp=/tmp/$$
+ status=1 # failure is the default!
+ trap "_cleanup; exit \$status" 0 1 2 3 15
+
+ _cleanup()
+ {
+ rm -f $tmp.*
+ }
+
+ # get standard environment, filters and checks
+ . ./common/rc
+ . ./common/filter
+
+ # real QA test starts here
+ _need_to_be_root
+ _supported_fs btrfs
+ _supported_os Linux
+ _require_scratch
+ _require_cloner
+ _require_btrfs_fs_feature "no_holes"
+ _require_btrfs_mkfs_feature "no-holes"
+
+ rm -f $seqres.full
+
+ test_cloning_inline_extents()
+ {
+ local mkfs_opts=$1
+ local mount_opts=$2
+
+ _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
+ _scratch_mount $mount_opts
+
+ # File bar, the source for all the following clone operations, consists
+ # of a single inline extent (50 bytes).
+ $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
+ | _filter_xfs_io
+
+ # Test cloning into a file with an extent (non-inlined) where the
+ # destination offset overlaps that extent. It should not be possible to
+ # clone the inline extent from file bar into this file.
+ $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
+ | _filter_xfs_io
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
+
+ # Doing IO against any range in the first 4K of the file should work.
+ # Due to a past clone ioctl bug which allowed cloning the inline extent,
+ # these operations resulted in EIO errors.
+ echo "File foo data after clone operation:"
+ # All bytes should have the value 0xaa (clone operation failed and did
+ # not modify our file).
+ od -t x1 $SCRATCH_MNT/foo
+ $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
+
+ # Test cloning the inline extent against a file which has a hole in its
+ # first 4K followed by a non-inlined extent. It should not be possible
+ # as well to clone the inline extent from file bar into this file.
+ $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
+ | _filter_xfs_io
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
+
+ # Doing IO against any range in the first 4K of the file should work.
+ # Due to a past clone ioctl bug which allowed cloning the inline extent,
+ # these operations resulted in EIO errors.
+ echo "File foo2 data after clone operation:"
+ # All bytes should have the value 0x00 (clone operation failed and did
+ # not modify our file).
+ od -t x1 $SCRATCH_MNT/foo2
+ $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
+
+ # Test cloning the inline extent against a file which has a size of zero
+ # but has a prealloc extent. It should not be possible as well to clone
+ # the inline extent from file bar into this file.
+ $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
+
+ # Doing IO against any range in the first 4K of the file should work.
+ # Due to a past clone ioctl bug which allowed cloning the inline extent,
+ # these operations resulted in EIO errors.
+ echo "First 50 bytes of foo3 after clone operation:"
+ # Should not be able to read any bytes, file has 0 bytes i_size (the
+ # clone operation failed and did not modify our file).
+ od -t x1 $SCRATCH_MNT/foo3
+ $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
+
+ # Test cloning the inline extent against a file which consists of a
+ # single inline extent that has a size not greater than the size of
+ # bar's inline extent (40 < 50).
+ # It should be possible to do the extent cloning from bar to this file.
+ $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
+ | _filter_xfs_io
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
+
+ # Doing IO against any range in the first 4K of the file should work.
+ echo "File foo4 data after clone operation:"
+ # Must match file bar's content.
+ od -t x1 $SCRATCH_MNT/foo4
+ $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
+
+ # Test cloning the inline extent against a file which consists of a
+ # single inline extent that has a size greater than the size of bar's
+ # inline extent (60 > 50).
+ # It should not be possible to clone the inline extent from file bar
+ # into this file.
+ $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
+ | _filter_xfs_io
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
+
+ # Reading the file should not fail.
+ echo "File foo5 data after clone operation:"
+ # Must have a size of 60 bytes, with all bytes having a value of 0x03
+ # (the clone operation failed and did not modify our file).
+ od -t x1 $SCRATCH_MNT/foo5
+
+ # Test cloning the inline extent against a file which has no extents but
+ # has a size greater than bar's inline extent (16K > 50).
+ # It should not be possible to clone the inline extent from file bar
+ # into this file.
+ $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
+
+ # Reading the file should not fail.
+ echo "File foo6 data after clone operation:"
+ # Must have a size of 16K, with all bytes having a value of 0x00 (the
+ # clone operation failed and did not modify our file).
+ od -t x1 $SCRATCH_MNT/foo6
+
+ # Test cloning the inline extent against a file which has no extents but
+ # has a size not greater than bar's inline extent (30 < 50).
+ # It should be possible to clone the inline extent from file bar into
+ # this file.
+ $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
+
+ # Reading the file should not fail.
+ echo "File foo7 data after clone operation:"
+ # Must have a size of 50 bytes, with all bytes having a value of 0xbb.
+ od -t x1 $SCRATCH_MNT/foo7
+
+ # Test cloning the inline extent against a file which has a size not
+ # greater than the size of bar's inline extent (20 < 50) but has
+ # a prealloc extent that goes beyond the file's size. It should not be
+ # possible to clone the inline extent from bar into this file.
+ $XFS_IO_PROG -f -c "falloc -k 0 1M" \
+ -c "pwrite -S 0x88 0 20" \
+ $SCRATCH_MNT/foo8 | _filter_xfs_io
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
+
+ echo "File foo8 data after clone operation:"
+ # Must have a size of 20 bytes, with all bytes having a value of 0x88
+ # (the clone operation did not modify our file).
+ od -t x1 $SCRATCH_MNT/foo8
+
+ _scratch_unmount
+ }
+
+ echo -e "\nTesting without compression and without the no-holes feature...\n"
+ test_cloning_inline_extents
+
+ echo -e "\nTesting with compression and without the no-holes feature...\n"
+ test_cloning_inline_extents "" "-o compress"
+
+ echo -e "\nTesting without compression and with the no-holes feature...\n"
+ test_cloning_inline_extents "-O no-holes" ""
+
+ echo -e "\nTesting with compression and with the no-holes feature...\n"
+ test_cloning_inline_extents "-O no-holes" "-o compress"
+
+ status=0
+ exit
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c | 195 ++++++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 152 insertions(+), 43 deletions(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -3166,6 +3166,150 @@ static void clone_update_extent_map(stru
+ &BTRFS_I(inode)->runtime_flags);
+ }
+
++/*
++ * Make sure we do not end up inserting an inline extent into a file that has
++ * already other (non-inline) extents. If a file has an inline extent it can
++ * not have any other extents and the (single) inline extent must start at the
++ * file offset 0. Failing to respect these rules will lead to file corruption,
++ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
++ *
++ * We can have extents that have been already written to disk or we can have
++ * dirty ranges still in delalloc, in which case the extent maps and items are
++ * created only when we run delalloc, and the delalloc ranges might fall outside
++ * the range we are currently locking in the inode's io tree. So we check the
++ * inode's i_size because of that (i_size updates are done while holding the
++ * i_mutex, which we are holding here).
++ * We also check to see if the inode has a size not greater than "datal" but has
++ * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
++ * protected against such concurrent fallocate calls by the i_mutex).
++ *
++ * If the file has no extents but a size greater than datal, do not allow the
++ * copy because we would need turn the inline extent into a non-inline one (even
++ * with NO_HOLES enabled). If we find our destination inode only has one inline
++ * extent, just overwrite it with the source inline extent if its size is less
++ * than the source extent's size, or we could copy the source inline extent's
++ * data into the destination inode's inline extent if the latter is greater than
++ * the former.
++ */
++static int clone_copy_inline_extent(struct inode *src,
++ struct inode *dst,
++ struct btrfs_trans_handle *trans,
++ struct btrfs_path *path,
++ struct btrfs_key *new_key,
++ const u64 drop_start,
++ const u64 datal,
++ const u64 skip,
++ const u64 size,
++ char *inline_data)
++{
++ struct btrfs_root *root = BTRFS_I(dst)->root;
++ const u64 aligned_end = ALIGN(new_key->offset + datal,
++ root->sectorsize);
++ int ret;
++ struct btrfs_key key;
++
++ if (new_key->offset > 0)
++ return -EOPNOTSUPP;
++
++ key.objectid = btrfs_ino(dst);
++ key.type = BTRFS_EXTENT_DATA_KEY;
++ key.offset = 0;
++ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
++ if (ret < 0) {
++ return ret;
++ } else if (ret > 0) {
++ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
++ ret = btrfs_next_leaf(root, path);
++ if (ret < 0)
++ return ret;
++ else if (ret > 0)
++ goto copy_inline_extent;
++ }
++ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
++ if (key.objectid == btrfs_ino(dst) &&
++ key.type == BTRFS_EXTENT_DATA_KEY) {
++ ASSERT(key.offset > 0);
++ return -EOPNOTSUPP;
++ }
++ } else if (i_size_read(dst) <= datal) {
++ struct btrfs_file_extent_item *ei;
++ u64 ext_len;
++
++ /*
++ * If the file size is <= datal, make sure there are no other
++ * extents following (can happen due to an fallocate call with
++ * the flag FALLOC_FL_KEEP_SIZE).
++ */
++ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
++ struct btrfs_file_extent_item);
++ /*
++ * If it's an inline extent, it can not have other extents
++ * following it.
++ */
++ if (btrfs_file_extent_type(path->nodes[0], ei) ==
++ BTRFS_FILE_EXTENT_INLINE)
++ goto copy_inline_extent;
++
++ ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
++ if (ext_len > aligned_end)
++ return -EOPNOTSUPP;
++
++ ret = btrfs_next_item(root, path);
++ if (ret < 0) {
++ return ret;
++ } else if (ret == 0) {
++ btrfs_item_key_to_cpu(path->nodes[0], &key,
++ path->slots[0]);
++ if (key.objectid == btrfs_ino(dst) &&
++ key.type == BTRFS_EXTENT_DATA_KEY)
++ return -EOPNOTSUPP;
++ }
++ }
++
++copy_inline_extent:
++ /*
++ * We have no extent items, or we have an extent at offset 0 which may
++ * or may not be inlined. All these cases are dealt the same way.
++ */
++ if (i_size_read(dst) > datal) {
++ /*
++ * If the destination inode has an inline extent...
++ * This would require copying the data from the source inline
++ * extent into the beginning of the destination's inline extent.
++ * But this is really complex, both extents can be compressed
++ * or just one of them, which would require decompressing and
++ * re-compressing data (which could increase the new compressed
++ * size, not allowing the compressed data to fit anymore in an
++ * inline extent).
++ * So just don't support this case for now (it should be rare,
++ * we are not really saving space when cloning inline extents).
++ */
++ return -EOPNOTSUPP;
++ }
++
++ btrfs_release_path(path);
++ ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
++ if (ret)
++ return ret;
++ ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
++ if (ret)
++ return ret;
++
++ if (skip) {
++ const u32 start = btrfs_file_extent_calc_inline_size(0);
++
++ memmove(inline_data + start, inline_data + start + skip, datal);
++ }
++
++ write_extent_buffer(path->nodes[0], inline_data,
++ btrfs_item_ptr_offset(path->nodes[0],
++ path->slots[0]),
++ size);
++ inode_add_bytes(dst, datal);
++
++ return 0;
++}
++
+ /**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+@@ -3432,21 +3576,6 @@ process_slot:
+ } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+ u64 skip = 0;
+ u64 trim = 0;
+- u64 aligned_end = 0;
+-
+- /*
+- * Don't copy an inline extent into an offset
+- * greater than zero. Having an inline extent
+- * at such an offset results in chaos as btrfs
+- * isn't prepared for such cases. Just skip
+- * this case for the same reasons as commented
+- * at btrfs_ioctl_clone().
+- */
+- if (last_dest_end > 0) {
+- ret = -EOPNOTSUPP;
+- btrfs_end_transaction(trans, root);
+- goto out;
+- }
+
+ if (off > key.offset) {
+ skip = off - key.offset;
+@@ -3464,42 +3593,22 @@ process_slot:
+ size -= skip + trim;
+ datal -= skip + trim;
+
+- aligned_end = ALIGN(new_key.offset + datal,
+- root->sectorsize);
+- ret = btrfs_drop_extents(trans, root, inode,
+- drop_start,
+- aligned_end,
+- 1);
++ ret = clone_copy_inline_extent(src, inode,
++ trans, path,
++ &new_key,
++ drop_start,
++ datal,
++ skip, size, buf);
+ if (ret) {
+ if (ret != -EOPNOTSUPP)
+ btrfs_abort_transaction(trans,
+- root, ret);
+- btrfs_end_transaction(trans, root);
+- goto out;
+- }
+-
+- ret = btrfs_insert_empty_item(trans, root, path,
+- &new_key, size);
+- if (ret) {
+- btrfs_abort_transaction(trans, root,
+- ret);
++ root,
++ ret);
+ btrfs_end_transaction(trans, root);
+ goto out;
+ }
+-
+- if (skip) {
+- u32 start =
+- btrfs_file_extent_calc_inline_size(0);
+- memmove(buf+start, buf+start+skip,
+- datal);
+- }
+-
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+- write_extent_buffer(leaf, buf,
+- btrfs_item_ptr_offset(leaf, slot),
+- size);
+- inode_add_bytes(inode, datal);
+ }
+
+ /* If we have an implicit hole (NO_HOLES feature). */
--- /dev/null
+From 1d512cb77bdbda80f0dd0620a3b260d697fd581d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 9 Nov 2015 00:33:58 +0000
+Subject: Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 1d512cb77bdbda80f0dd0620a3b260d697fd581d upstream.
+
+If we are using the NO_HOLES feature, we have a tiny time window when
+running delalloc for a nodatacow inode where we can race with a concurrent
+link or xattr add operation leading to a BUG_ON.
+
+This happens because at run_delalloc_nocow() we end up casting a leaf item
+of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a
+file extent item (struct btrfs_file_extent_item) and then analyse its
+extent type field, which won't match any of the expected extent types
+(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an
+explicit BUG_ON(1).
+
+The following sequence diagram shows how the race happens when running a
+no-cow delalloc range [4K, 8K[ for inode 257 and we have the following
+neighbour leafs:
+
+ Leaf X (has N items) Leaf Y
+
+ [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ]
+ slot N - 2 slot N - 1 slot 0
+
+ (Note the implicit hole for inode 257 regarding the [0, 8K[ range)
+
+ CPU 1 CPU 2
+
+ run_delalloc_nocow()
+ btrfs_lookup_file_extent()
+ --> searches for a key with value
+ (257 EXTENT_DATA 4096) in the
+ fs/subvol tree
+ --> returns us a path with
+ path->nodes[0] == leaf X and
+ path->slots[0] == N
+
+ because path->slots[0] is >=
+ btrfs_header_nritems(leaf X), it
+ calls btrfs_next_leaf()
+
+ btrfs_next_leaf()
+ --> releases the path
+
+ hard link added to our inode,
+ with key (257 INODE_REF 500)
+ added to the end of leaf X,
+ so leaf X now has N + 1 keys
+
+ --> searches for the key
+ (257 INODE_REF 256), because
+ it was the last key in leaf X
+ before it released the path,
+ with path->keep_locks set to 1
+
+ --> ends up at leaf X again and
+ it verifies that the key
+ (257 INODE_REF 256) is no longer
+ the last key in the leaf, so it
+ returns with path->nodes[0] ==
+ leaf X and path->slots[0] == N,
+ pointing to the new item with
+ key (257 INODE_REF 500)
+
+ the loop iteration of run_delalloc_nocow()
+ does not break out the loop and continues
+ because the key referenced in the path
+ at path->nodes[0] and path->slots[0] is
+ for inode 257, its type is < BTRFS_EXTENT_DATA_KEY
+ and its offset (500) is less than our delalloc
+ range's end (8192)
+
+ the item pointed by the path, an inode reference item,
+ is (incorrectly) interpreted as a file extent item and
+ we get an invalid extent type, leading to the BUG_ON(1):
+
+ if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ (...)
+ } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ (...)
+ } else {
+ BUG_ON(1)
+ }
+
+The same can happen if a xattr is added concurrently and ends up having
+a key with an offset smaller than the delalloc's range end.
+
+So fix this by skipping keys with a type smaller than
+BTRFS_EXTENT_DATA_KEY.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1294,8 +1294,14 @@ next_slot:
+ num_bytes = 0;
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+- if (found_key.objectid > ino ||
+- found_key.type > BTRFS_EXTENT_DATA_KEY ||
++ if (found_key.objectid > ino)
++ break;
++ if (WARN_ON_ONCE(found_key.objectid < ino) ||
++ found_key.type < BTRFS_EXTENT_DATA_KEY) {
++ path->slots[0]++;
++ goto next_slot;
++ }
++ if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ found_key.offset > end)
+ break;
+
--- /dev/null
+From aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 6 Nov 2015 13:33:33 +0000
+Subject: Btrfs: fix race leading to incorrect item deletion when dropping extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c upstream.
+
+While running a stress test I got the following warning triggered:
+
+ [191627.672810] ------------[ cut here ]------------
+ [191627.673949] WARNING: CPU: 8 PID: 8447 at fs/btrfs/file.c:779 __btrfs_drop_extents+0x391/0xa50 [btrfs]()
+ (...)
+ [191627.701485] Call Trace:
+ [191627.702037] [<ffffffff8145f077>] dump_stack+0x4f/0x7b
+ [191627.702992] [<ffffffff81095de5>] ? console_unlock+0x356/0x3a2
+ [191627.704091] [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb
+ [191627.705380] [<ffffffffa0664499>] ? __btrfs_drop_extents+0x391/0xa50 [btrfs]
+ [191627.706637] [<ffffffff8104b46d>] warn_slowpath_null+0x1a/0x1c
+ [191627.707789] [<ffffffffa0664499>] __btrfs_drop_extents+0x391/0xa50 [btrfs]
+ [191627.709155] [<ffffffff8115663c>] ? cache_alloc_debugcheck_after.isra.32+0x171/0x1d0
+ [191627.712444] [<ffffffff81155007>] ? kmemleak_alloc_recursive.constprop.40+0x16/0x18
+ [191627.714162] [<ffffffffa06570c9>] insert_reserved_file_extent.constprop.40+0x83/0x24e [btrfs]
+ [191627.715887] [<ffffffffa065422b>] ? start_transaction+0x3bb/0x610 [btrfs]
+ [191627.717287] [<ffffffffa065b604>] btrfs_finish_ordered_io+0x273/0x4e2 [btrfs]
+ [191627.728865] [<ffffffffa065b888>] finish_ordered_fn+0x15/0x17 [btrfs]
+ [191627.730045] [<ffffffffa067d688>] normal_work_helper+0x14c/0x32c [btrfs]
+ [191627.731256] [<ffffffffa067d96a>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
+ [191627.732661] [<ffffffff81061119>] process_one_work+0x24c/0x4ae
+ [191627.733822] [<ffffffff810615b0>] worker_thread+0x206/0x2c2
+ [191627.734857] [<ffffffff810613aa>] ? process_scheduled_works+0x2f/0x2f
+ [191627.736052] [<ffffffff810613aa>] ? process_scheduled_works+0x2f/0x2f
+ [191627.737349] [<ffffffff810669a6>] kthread+0xef/0xf7
+ [191627.738267] [<ffffffff810f3b3a>] ? time_hardirqs_on+0x15/0x28
+ [191627.739330] [<ffffffff810668b7>] ? __kthread_parkme+0xad/0xad
+ [191627.741976] [<ffffffff81465592>] ret_from_fork+0x42/0x70
+ [191627.743080] [<ffffffff810668b7>] ? __kthread_parkme+0xad/0xad
+ [191627.744206] ---[ end trace bbfddacb7aaada8d ]---
+
+ $ cat -n fs/btrfs/file.c
+ 691 int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+ (...)
+ 758 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ 759 if (key.objectid > ino ||
+ 760 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ 761 break;
+ 762
+ 763 fi = btrfs_item_ptr(leaf, path->slots[0],
+ 764 struct btrfs_file_extent_item);
+ 765 extent_type = btrfs_file_extent_type(leaf, fi);
+ 766
+ 767 if (extent_type == BTRFS_FILE_EXTENT_REG ||
+ 768 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+ (...)
+ 774 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+ (...)
+ 778 } else {
+ 779 WARN_ON(1);
+ 780 extent_end = search_start;
+ 781 }
+ (...)
+
+This happened because the item we were processing did not match a file
+extent item (its key type != BTRFS_EXTENT_DATA_KEY), and even on this
+case we cast the item to a struct btrfs_file_extent_item pointer and
+then find a type field value that does not match any of the expected
+values (BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]). This scenario happens
+due to a tiny time window where a race can happen as exemplified below.
+For example, consider the following scenario where we're using the
+NO_HOLES feature and we have the following two neighbour leafs:
+
+ Leaf X (has N items) Leaf Y
+
+[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 EXTENT_DATA 8192), ... ]
+ slot N - 2 slot N - 1 slot 0
+
+Our inode 257 has an implicit hole in the range [0, 8K[ (implicit rather
+than explicit because NO_HOLES is enabled). Now if our inode has an
+ordered extent for the range [4K, 8K[ that is finishing, the following
+can happen:
+
+ CPU 1 CPU 2
+
+ btrfs_finish_ordered_io()
+ insert_reserved_file_extent()
+ __btrfs_drop_extents()
+ Searches for the key
+ (257 EXTENT_DATA 4096) through
+ btrfs_lookup_file_extent()
+
+ Key not found and we get a path where
+ path->nodes[0] == leaf X and
+ path->slots[0] == N
+
+ Because path->slots[0] is >=
+ btrfs_header_nritems(leaf X), we call
+ btrfs_next_leaf()
+
+ btrfs_next_leaf() releases the path
+
+ inserts key
+ (257 INODE_REF 4096)
+ at the end of leaf X,
+ leaf X now has N + 1 keys,
+ and the new key is at
+ slot N
+
+ btrfs_next_leaf() searches for
+ key (257 INODE_REF 256), with
+ path->keep_locks set to 1,
+ because it was the last key it
+ saw in leaf X
+
+ finds it in leaf X again and
+ notices it's no longer the last
+ key of the leaf, so it returns 0
+ with path->nodes[0] == leaf X and
+ path->slots[0] == N (which is now
+ < btrfs_header_nritems(leaf X)),
+ pointing to the new key
+ (257 INODE_REF 4096)
+
+ __btrfs_drop_extents() casts the
+ item at path->nodes[0], slot
+ path->slots[0], to a struct
+ btrfs_file_extent_item - it does
+ not skip keys for the target
+ inode with a type less than
+ BTRFS_EXTENT_DATA_KEY
+ (BTRFS_INODE_REF_KEY < BTRFS_EXTENT_DATA_KEY)
+
+ sees a bogus value for the type
+ field triggering the WARN_ON in
+ the trace shown above, and sets
+ extent_end = search_start (4096)
+
+ does the if-then-else logic to
+ fixup 0 length extent items created
+ by a past bug from hole punching:
+
+ if (extent_end == key.offset &&
+ extent_end >= search_start)
+ goto delete_extent_item;
+
+ that evaluates to true and it ends
+ up deleting the key pointed to by
+ path->slots[0], (257 INODE_REF 4096),
+ from leaf X
+
+The same could happen for example for a xattr that ends up having a key
+with an offset value that matches search_start (very unlikely but not
+impossible).
+
+So fix this by ensuring that keys smaller than BTRFS_EXTENT_DATA_KEY are
+skipped, never cast to struct btrfs_file_extent_item and never deleted
+by accident. Also protect against the unexpected case of getting a key
+for a lower inode number by skipping that key and issuing a warning.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/file.c | 16 ++++++++++++----
+ 1 file changed, 12 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -756,8 +756,16 @@ next_slot:
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+- if (key.objectid > ino ||
+- key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
++
++ if (key.objectid > ino)
++ break;
++ if (WARN_ON_ONCE(key.objectid < ino) ||
++ key.type < BTRFS_EXTENT_DATA_KEY) {
++ ASSERT(del_nr == 0);
++ path->slots[0]++;
++ goto next_slot;
++ }
++ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+ break;
+
+ fi = btrfs_item_ptr(leaf, path->slots[0],
+@@ -776,8 +784,8 @@ next_slot:
+ btrfs_file_extent_inline_len(leaf,
+ path->slots[0], fi);
+ } else {
+- WARN_ON(1);
+- extent_end = search_start;
++ /* can't happen */
++ BUG();
+ }
+
+ /*
--- /dev/null
+From f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 9 Nov 2015 18:06:38 +0000
+Subject: Btrfs: fix race when listing an inode's xattrs
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d upstream.
+
+When listing an inode's xattrs we have a time window where we race against
+a concurrent operation for adding a new hard link for our inode that makes
+us not return any xattr to user space. In order for this to happen, the
+first xattr of our inode needs to be at slot 0 of a leaf and the previous
+leaf must still have room for an inode ref (or extref) item, and this can
+happen because an inode's listxattrs callback does not lock the inode's
+i_mutex (nor does the VFS do it for us), but adding a hard link to an
+inode makes the VFS lock the inode's i_mutex before calling the inode's
+link callback.
+
+If we have the following leafs:
+
+ Leaf X (has N items) Leaf Y
+
+ [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ] [ (257 XATTR_ITEM 12345), ... ]
+ slot N - 2 slot N - 1 slot 0
+
+The race illustrated by the following sequence diagram is possible:
+
+ CPU 1 CPU 2
+
+ btrfs_listxattr()
+
+ searches for key (257 XATTR_ITEM 0)
+
+ gets path with path->nodes[0] == leaf X
+ and path->slots[0] == N
+
+ because path->slots[0] is >=
+ btrfs_header_nritems(leaf X), it calls
+ btrfs_next_leaf()
+
+ btrfs_next_leaf()
+ releases the path
+
+ adds key (257 INODE_REF 666)
+ to the end of leaf X (slot N),
+ and leaf X now has N + 1 items
+
+ searches for the key (257 INODE_REF 256),
+ with path->keep_locks == 1, because that
+ is the last key it saw in leaf X before
+ releasing the path
+
+ ends up at leaf X again and it verifies
+ that the key (257 INODE_REF 256) is no
+ longer the last key in leaf X, so it
+ returns with path->nodes[0] == leaf X
+ and path->slots[0] == N, pointing to
+ the new item with key (257 INODE_REF 666)
+
+ btrfs_listxattr's loop iteration sees that
+ the type of the key pointed by the path is
+ different from the type BTRFS_XATTR_ITEM_KEY
+ and so it breaks the loop and stops looking
+ for more xattr items
+ --> the application doesn't get any xattr
+ listed for our inode
+
+So fix this by breaking the loop only if the key's type is greater than
+BTRFS_XATTR_ITEM_KEY and skip the current key if its type is smaller.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/xattr.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/xattr.c
++++ b/fs/btrfs/xattr.c
+@@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *d
+ /* check to make sure this item is what we want */
+ if (found_key.objectid != key.objectid)
+ break;
+- if (found_key.type != BTRFS_XATTR_ITEM_KEY)
++ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+ break;
++ if (found_key.type < BTRFS_XATTR_ITEM_KEY)
++ goto next;
+
+ di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+ if (verify_dir_item(root, leaf, di))
--- /dev/null
+From 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 16 Oct 2015 12:34:25 +0100
+Subject: Btrfs: fix truncation of compressed and inlined extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 upstream.
+
+When truncating a file that consists of a compressed inline extent to a
+smaller size, we did not discard (or make unusable) the data between the
+new file size and the old file size, wasting metadata space and allowing
+the truncated data to be leaked and the data corruption/loss mentioned
+below.
+We were also not correctly decrementing the number of bytes used by the
+inode; we were setting it to zero, giving a wrong report for callers of
+the stat(2) syscall. The fsck tool also reported an error about a mismatch
+between the nbytes of the file versus the real space used by the file.
+
+Now because we weren't discarding the truncated region of the file, it
+was possible for a caller of the clone ioctl to actually read the data
+that was truncated, allowing for a security breach without requiring root
+access to the system, using only standard filesystem operations. The
+scenario is the following:
+
+ 1) User A creates a file which consists of an inline and compressed
+ extent with a size of 2000 bytes - the file is not accessible to
+ any other users (no read, write or execution permission for anyone
+ else);
+
+ 2) The user truncates the file to a size of 1000 bytes;
+
+ 3) User A makes the file world readable;
+
+ 4) User B creates a file consisting of an inline extent of 2000 bytes;
+
+ 5) User B issues a clone operation from user A's file into its own
+ file (using a length argument of 0, clone the whole range);
+
+ 6) User B now gets to see the 1000 bytes that user A truncated from
+ its file before it made its file world readable. User B also lost
+ the bytes in the range [1000, 2000[ bytes from its own file, but
+ that might be ok if his/her intention was reading stale data from
+ user A that was never supposed to be public.
+
+Note that this contrasts with the case where we truncate a file from 2000
+bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
+this case reading any byte from the range [1000, 2000[ will return a value
+of 0x00, instead of the original data.
+
+This problem exists since the clone ioctl was added and happens both with
+and without my recent data loss and file corruption fixes for the clone
+ioctl (patch "Btrfs: fix file corruption and data loss after cloning
+inline extents").
+
+So fix this by truncating the compressed inline extents as we do for the
+non-compressed case, which involves decompressing the data (if it isn't
+already in the page cache), compressing the truncated version of the extent,
+writing the compressed content into the inline extent and then truncating it.
+
+The following test case for fstests reproduces the problem. In order for
+the test to pass both this fix and my previous fix for the clone ioctl
+that forbids cloning a smaller inline extent into a larger one,
+which is titled "Btrfs: fix file corruption and data loss after cloning
+inline extents", are needed. Without that other fix the test fails in a
+different way that does not leak the truncated data, instead part of
+destination file gets replaced with zeroes (because the destination file
+has a larger inline extent than the source).
+
+ seq=`basename $0`
+ seqres=$RESULT_DIR/$seq
+ echo "QA output created by $seq"
+ tmp=/tmp/$$
+ status=1 # failure is the default!
+ trap "_cleanup; exit \$status" 0 1 2 3 15
+
+ _cleanup()
+ {
+ rm -f $tmp.*
+ }
+
+ # get standard environment, filters and checks
+ . ./common/rc
+ . ./common/filter
+
+ # real QA test starts here
+ _need_to_be_root
+ _supported_fs btrfs
+ _supported_os Linux
+ _require_scratch
+ _require_cloner
+
+ rm -f $seqres.full
+
+ _scratch_mkfs >>$seqres.full 2>&1
+ _scratch_mount "-o compress"
+
+ # Create our test files. File foo is going to be the source of a clone operation
+ # and consists of a single inline extent with an uncompressed size of 512 bytes,
+ # while file bar consists of a single inline extent with an uncompressed size of
+ # 256 bytes. For our test's purpose, it's important that file bar has an inline
+ # extent with a size smaller than foo's inline extent.
+ $XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128" \
+ -c "pwrite -S 0x2a 128 384" \
+ $SCRATCH_MNT/foo | _filter_xfs_io
+ $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
+
+ # Now durably persist all metadata and data. We do this to make sure that we get
+ # on disk an inline extent with a size of 512 bytes for file foo.
+ sync
+
+ # Now truncate our file foo to a smaller size. Because it consists of a
+ # compressed and inline extent, btrfs did not shrink the inline extent to the
+ # new size (if the extent was not compressed, btrfs would shrink it to 128
+ # bytes), it only updates the inode's i_size to 128 bytes.
+ $XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
+
+ # Now clone foo's inline extent into bar.
+ # This clone operation should fail with errno EOPNOTSUPP because the source
+ # file consists only of an inline extent and the file's size is smaller than
+ # the inline extent of the destination (128 bytes < 256 bytes). However the
+ # clone ioctl was not prepared to deal with a file that has a size smaller
+ # than the size of its inline extent (something that happens only for compressed
+ # inline extents), resulting in copying the full inline extent from the source
+ # file into the destination file.
+ #
+ # Note that btrfs' clone operation for inline extents consists of removing the
+ # inline extent from the destination inode and copying the inline extent from the
+ # source inode into the destination inode, meaning that if the destination
+ # inode's inline extent is larger (N bytes) than the source inode's inline
+ # extent (M bytes), some bytes (N - M bytes) will be lost from the destination
+ # file. Btrfs could copy the source inline extent's data into the destination's
+ # inline extent so that we would not lose any data, but that's currently not
+ # done due to the complexity that would be needed to deal with such cases
+ # (specially when one or both extents are compressed), returning EOPNOTSUPP, as
+ # it's normally not a very common case to clone very small files (only case
+ # where we get inline extents) and copying inline extents does not save any
+ # space (unlike for normal, non-inlined extents).
+ $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
+
+ # Now because the above clone operation used to succeed, and due to foo's inline
+ # extent not being shrunk by the truncate operation, our file bar got the whole
+ # inline extent copied from foo, making us lose the last 128 bytes from bar
+ # which got replaced by the bytes in range [128, 256[ from foo before foo was
+ # truncated - in other words, data loss from bar and being able to read old and
+ # stale data from foo that should not be possible to read anymore through normal
+ # filesystem operations. Contrast with the case where we truncate a file from a
+ # size N to a smaller size M, truncate it back to size N and then read the range
+ # [M, N[: we should always get the value 0x00 for all the bytes in that range.
+
+ # We expected the clone operation to fail with errno EOPNOTSUPP and therefore
+ # not modify our file bar's data/metadata. So its content should be 256 bytes
+ # long with all bytes having the value 0xbb.
+ #
+ # Without the btrfs bug fix, the clone operation succeeded and resulted in
+ # leaking truncated data from foo, the bytes that belonged to its range
+ # [128, 256[, and losing data from bar in that same range. So reading the
+ # file gave us the following content:
+ #
+ # 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
+ # *
+ # 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
+ # *
+ # 0000400
+ echo "File bar's content after the clone operation:"
+ od -t x1 $SCRATCH_MNT/bar
+
+ # Also because foo's inline extent was not shrunk by the truncate
+ # operation, btrfs' fsck, which is run by the fstests framework every time a
+ # test completes, failed reporting the following error:
+ #
+ # root 5 inode 257 errors 400, nbytes wrong
+
+ status=0
+ exit
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c | 82 +++++++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 68 insertions(+), 14 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4184,6 +4184,47 @@ static int truncate_space_check(struct b
+
+ }
+
++static int truncate_inline_extent(struct inode *inode,
++ struct btrfs_path *path,
++ struct btrfs_key *found_key,
++ const u64 item_end,
++ const u64 new_size)
++{
++ struct extent_buffer *leaf = path->nodes[0];
++ int slot = path->slots[0];
++ struct btrfs_file_extent_item *fi;
++ u32 size = (u32)(new_size - found_key->offset);
++ struct btrfs_root *root = BTRFS_I(inode)->root;
++
++ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
++
++ if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
++ loff_t offset = new_size;
++ loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
++
++ /*
++ * Zero out the remaining of the last page of our inline extent,
++ * instead of directly truncating our inline extent here - that
++ * would be much more complex (decompressing all the data, then
++ * compressing the truncated data, which might be bigger than
++ * the size of the inline extent, resize the extent, etc).
++ * We release the path because to get the page we might need to
++ * read the extent item from disk (data not in the page cache).
++ */
++ btrfs_release_path(path);
++ return btrfs_truncate_page(inode, offset, page_end - offset, 0);
++ }
++
++ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
++ size = btrfs_file_extent_calc_inline_size(size);
++ btrfs_truncate_item(root, path, size, 1);
++
++ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
++ inode_sub_bytes(inode, item_end + 1 - new_size);
++
++ return 0;
++}
++
+ /*
+ * this can truncate away extent items, csum items and directory items.
+ * It starts at a high offset and removes keys until it can't find
+@@ -4378,27 +4419,40 @@ search_again:
+ * special encodings
+ */
+ if (!del_item &&
+- btrfs_file_extent_compression(leaf, fi) == 0 &&
+ btrfs_file_extent_encryption(leaf, fi) == 0 &&
+ btrfs_file_extent_other_encoding(leaf, fi) == 0) {
+- u32 size = new_size - found_key.offset;
+-
+- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+- inode_sub_bytes(inode, item_end + 1 -
+- new_size);
+
+ /*
+- * update the ram bytes to properly reflect
+- * the new size of our item
++ * Need to release path in order to truncate a
++ * compressed extent. So delete any accumulated
++ * extent items so far.
+ */
+- btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+- size =
+- btrfs_file_extent_calc_inline_size(size);
+- btrfs_truncate_item(root, path, size, 1);
++ if (btrfs_file_extent_compression(leaf, fi) !=
++ BTRFS_COMPRESS_NONE && pending_del_nr) {
++ err = btrfs_del_items(trans, root, path,
++ pending_del_slot,
++ pending_del_nr);
++ if (err) {
++ btrfs_abort_transaction(trans,
++ root,
++ err);
++ goto error;
++ }
++ pending_del_nr = 0;
++ }
++
++ err = truncate_inline_extent(inode, path,
++ &found_key,
++ item_end,
++ new_size);
++ if (err) {
++ btrfs_abort_transaction(trans,
++ root, err);
++ goto error;
++ }
+ } else if (test_bit(BTRFS_ROOT_REF_COWS,
+ &root->state)) {
+- inode_sub_bytes(inode, item_end + 1 -
+- found_key.offset);
++ inode_sub_bytes(inode, item_end + 1 - new_size);
+ }
+ }
+ delete:
--- /dev/null
+From 777d738a5e58ba3b6f3932ab1543ce93703f4873 Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Wed, 30 Sep 2015 15:04:42 +0200
+Subject: ceph: fix message length computation
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+commit 777d738a5e58ba3b6f3932ab1543ce93703f4873 upstream.
+
+create_request_message() computes the maximum length of a message,
+but uses the wrong type for the time stamp: sizeof(struct timespec)
+may be 8 or 16 depending on the architecture, while sizeof(struct
+ceph_timespec) is always 8, and that is what gets put into the
+message.
+
+Found while auditing the uses of timespec for y2038 problems.
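+
+For illustration only (not part of the patch), a small user-space sketch of
+the size mismatch; the two 32-bit fields below mirror the wire format of the
+kernel's struct ceph_timespec and are reproduced here just for the demo:
+
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <time.h>
+
+  struct ceph_timespec {          /* wire format: always 8 bytes */
+          uint32_t tv_sec;        /* __le32 in the kernel header */
+          uint32_t tv_nsec;       /* __le32 in the kernel header */
+  };
+
+  int main(void)
+  {
+          /* struct timespec is 16 bytes on most 64-bit ABIs, so sizing the
+           * message with it over-reserves; the wire struct is what is sent. */
+          printf("timespec=%zu ceph_timespec=%zu\n",
+                 sizeof(struct timespec), sizeof(struct ceph_timespec));
+          return 0;
+  }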
+
+Fixes: b8e69066d8af ("ceph: include time stamp in every MDS request")
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Yan, Zheng <zyan@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ceph/mds_client.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ceph/mds_client.c
++++ b/fs/ceph/mds_client.c
+@@ -1905,7 +1905,7 @@ static struct ceph_msg *create_request_m
+
+ len = sizeof(*head) +
+ pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+- sizeof(struct timespec);
++ sizeof(struct ceph_timespec);
+
+ /* calculate (max) length for cap releases */
+ len += sizeof(struct ceph_mds_request_release) *
--- /dev/null
+From 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Thu, 5 Nov 2015 00:01:51 +0100
+Subject: debugfs: fix refcount imbalance in start_creating
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 upstream.
+
+In debugfs' start_creating(), we pin the file system to safely access
+its root. When we failed to create a file, we unpin the file system via
+failed_creating() to release the mount count and eventually the reference
+of the vfsmount.
+
+However, when we run into an error during lookup_one_len() while still
+in start_creating(), we only release the parent's mutex but not the
+reference on the mount. It looks like this was done in the past, but after
+splitting portions of __create_file() into start_creating() and
+end_creating() via 190afd81e4a5 ("debugfs: split the beginning and the
+end of __create_file() off"), this seems to have been missed. Noticed during
+code review.
+
+Fixes: 190afd81e4a5 ("debugfs: split the beginning and the end of __create_file() off")
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/debugfs/inode.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/debugfs/inode.c
++++ b/fs/debugfs/inode.c
+@@ -276,8 +276,12 @@ static struct dentry *start_creating(con
+ dput(dentry);
+ dentry = ERR_PTR(-EEXIST);
+ }
+- if (IS_ERR(dentry))
++
++ if (IS_ERR(dentry)) {
+ mutex_unlock(&d_inode(parent)->i_mutex);
++ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
++ }
++
+ return dentry;
+ }
+
--- /dev/null
+From 937d7b84dca58f2565715f2c8e52f14c3d65fb22 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Fri, 2 Oct 2015 23:54:58 -0400
+Subject: ext4 crypto: fix memory leak in ext4_bio_write_page()
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 937d7b84dca58f2565715f2c8e52f14c3d65fb22 upstream.
+
+There are times when ext4_bio_write_page() is called even though we
+don't actually need to do any I/O. This happens when ext4_writepage()
+gets called by the jbd2 commit path when an inode needs to force its
+pages written out in order to provide data=ordered guarantees --- and
+a page is backed by an unwritten (e.g., uninitialized) block on disk,
+or if delayed allocation means the page's backing store hasn't been
+allocated yet. In that case, we need to skip the call to
+ext4_encrypt_page(), since in addition to wasting CPU, it leads to a
+bounce page and an ext4 crypto context getting leaked.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/page-io.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -426,6 +426,7 @@ int ext4_bio_write_page(struct ext4_io_s
+ struct buffer_head *bh, *head;
+ int ret = 0;
+ int nr_submitted = 0;
++ int nr_to_submit = 0;
+
+ blocksize = 1 << inode->i_blkbits;
+
+@@ -478,11 +479,13 @@ int ext4_bio_write_page(struct ext4_io_s
+ unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+ }
+ set_buffer_async_write(bh);
++ nr_to_submit++;
+ } while ((bh = bh->b_this_page) != head);
+
+ bh = head = page_buffers(page);
+
+- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
++ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
++ nr_to_submit) {
+ data_page = ext4_encrypt(inode, page);
+ if (IS_ERR(data_page)) {
+ ret = PTR_ERR(data_page);
--- /dev/null
+From 6934da9238da947628be83635e365df41064b09b Mon Sep 17 00:00:00 2001
+From: Lukas Czerner <lczerner@redhat.com>
+Date: Sat, 17 Oct 2015 22:57:06 -0400
+Subject: ext4: fix potential use after free in __ext4_journal_stop
+
+From: Lukas Czerner <lczerner@redhat.com>
+
+commit 6934da9238da947628be83635e365df41064b09b upstream.
+
+There is a use-after-free possibility in __ext4_journal_stop() in the
+case that we free the handle in the first jbd2_journal_stop(), because
+we reference handle->h_err afterwards. This was introduced in
+9705acd63b125dee8b15c705216d7186daea4625 and it is wrong. Fix it by
+storing the handle->h_err value beforehand and avoiding any reference
+to the potentially freed handle.
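+
+As a toy illustration only (not ext4 code; the types and names below are
+hypothetical), the general pattern of the fix is to copy any field you still
+need out of an object before calling a function that may free it:
+
+  #include <stdio.h>
+  #include <stdlib.h>
+
+  struct handle { int h_err; };
+
+  static int journal_stop(struct handle *h)
+  {
+          free(h);                /* the handle may be gone after this call */
+          return 0;
+  }
+
+  static int stop_handle(struct handle *h)
+  {
+          int err = h->h_err;     /* saved while the handle is still valid */
+          int rc = journal_stop(h);
+
+          return err ? err : rc;  /* no dereference of h after the free */
+  }
+
+  int main(void)
+  {
+          struct handle *h = calloc(1, sizeof(*h));
+
+          if (!h)
+                  return 1;
+          printf("stop_handle() returned %d\n", stop_handle(h));
+          return 0;
+  }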
+
+Fixes: 9705acd63b125dee8b15c705216d7186daea4625
+Signed-off-by: Lukas Czerner <lczerner@redhat.com>
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/ext4_jbd2.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.c
++++ b/fs/ext4/ext4_jbd2.c
+@@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *wher
+ return 0;
+ }
+
++ err = handle->h_err;
+ if (!handle->h_transaction) {
+- err = jbd2_journal_stop(handle);
+- return handle->h_err ? handle->h_err : err;
++ rc = jbd2_journal_stop(handle);
++ return err ? err : rc;
+ }
+
+ sb = handle->h_transaction->t_journal->j_private;
+- err = handle->h_err;
+ rc = jbd2_journal_stop(handle);
+
+ if (!err)
--- /dev/null
+From 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 Mon Sep 17 00:00:00 2001
+From: Daeho Jeong <daeho.jeong@samsung.com>
+Date: Sun, 18 Oct 2015 17:02:56 -0400
+Subject: ext4, jbd2: ensure entering into panic after recording an error in superblock
+
+From: Daeho Jeong <daeho.jeong@samsung.com>
+
+commit 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 upstream.
+
+If an EXT4 filesystem uses JBD2 journaling and an error occurs, the
+journaling is aborted first, the error number is recorded in the JBD2
+superblock and, finally, the system enters the panic state when the
+"errors=panic" option is used. But in a rare case this sequence gets
+twisted, as in the figure below, and the system can enter the panic
+state (which means a system reset in a mobile environment) before the
+error has been recorded in the journal superblock. In that case, e2fsck
+cannot recognize that a filesystem failure occurred in the previous run
+and the corruption will not be fixed.
+
+Task A Task B
+ext4_handle_error()
+-> jbd2_journal_abort()
+ -> __journal_abort_soft()
+ -> __jbd2_journal_abort_hard()
+ | -> journal->j_flags |= JBD2_ABORT;
+ |
+ | __ext4_abort()
+ | -> jbd2_journal_abort()
+ | | -> __journal_abort_soft()
+ | | -> if (journal->j_flags & JBD2_ABORT)
+ | | return;
+ | -> panic()
+ |
+ -> jbd2_journal_update_sb_errno()
+
+Tested-by: Hobin Woo <hobin.woo@samsung.com>
+Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/super.c | 12 ++++++++++--
+ fs/jbd2/journal.c | 6 +++++-
+ include/linux/jbd2.h | 1 +
+ 3 files changed, 16 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -396,9 +396,13 @@ static void ext4_handle_error(struct sup
+ smp_wmb();
+ sb->s_flags |= MS_RDONLY;
+ }
+- if (test_opt(sb, ERRORS_PANIC))
++ if (test_opt(sb, ERRORS_PANIC)) {
++ if (EXT4_SB(sb)->s_journal &&
++ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
++ return;
+ panic("EXT4-fs (device %s): panic forced after error\n",
+ sb->s_id);
++ }
+ }
+
+ #define ext4_error_ratelimit(sb) \
+@@ -587,8 +591,12 @@ void __ext4_abort(struct super_block *sb
+ jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+ save_error_info(sb, function, line);
+ }
+- if (test_opt(sb, ERRORS_PANIC))
++ if (test_opt(sb, ERRORS_PANIC)) {
++ if (EXT4_SB(sb)->s_journal &&
++ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
++ return;
+ panic("EXT4-fs panic from previous error\n");
++ }
+ }
+
+ void __ext4_msg(struct super_block *sb,
+--- a/fs/jbd2/journal.c
++++ b/fs/jbd2/journal.c
+@@ -2086,8 +2086,12 @@ static void __journal_abort_soft (journa
+
+ __jbd2_journal_abort_hard(journal);
+
+- if (errno)
++ if (errno) {
+ jbd2_journal_update_sb_errno(journal);
++ write_lock(&journal->j_state_lock);
++ journal->j_flags |= JBD2_REC_ERR;
++ write_unlock(&journal->j_state_lock);
++ }
+ }
+
+ /**
+--- a/include/linux/jbd2.h
++++ b/include/linux/jbd2.h
+@@ -1007,6 +1007,7 @@ struct journal_s
+ #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file
+ * data write error in ordered
+ * mode */
++#define JBD2_REC_ERR 0x080 /* The errno in the sb has been recorded */
+
+ /*
+ * Function declarations for the journaling transaction and buffer
--- /dev/null
+From 100ceb66d5c40cc0c7018e06a9474302470be73c Mon Sep 17 00:00:00 2001
+From: Stefan Richter <stefanr@s5r6.in-berlin.de>
+Date: Tue, 3 Nov 2015 01:46:21 +0100
+Subject: firewire: ohci: fix JMicron JMB38x IT context discovery
+
+From: Stefan Richter <stefanr@s5r6.in-berlin.de>
+
+commit 100ceb66d5c40cc0c7018e06a9474302470be73c upstream.
+
+Reported by Clifford and Craig for JMicron OHCI-1394 + SDHCI combo
+controllers: Often or even most of the time, the controller is
+initialized with the message "added OHCI v1.10 device as card 0, 4 IR +
+0 IT contexts, quirks 0x10". With 0 isochronous transmit DMA contexts
+(IT contexts), applications like audio output are impossible.
+
+However, OHCI-1394 demands that at least 4 IT contexts are implemented
+by the link layer controller, and indeed JMicron JMB38x do implement
+four of them. Only their IsoXmitIntMask register is unreliable at early
+access.
+
+With my own JMB381 single function controller I found:
+ - I can reproduce the problem with a lower probability than Craig's.
+ - If I put a loop around the section which clears and reads
+ IsoXmitIntMask, then either the first or the second attempt will
+ return the correct initial mask of 0x0000000f. I never encountered
+ a case of needing more than a second attempt.
+ - Consequently, if I put a dummy reg_read(...IsoXmitIntMaskSet)
+ before the first write, the subsequent read will return the correct
+ result.
+ - If I merely ignore a wrong read result and force the known real
+ result, later isochronous transmit DMA usage works just fine.
+
+So let's just fix this chip bug up by the latter method. Tested with
+JMB381 on kernel 3.13 and 4.3.
+
+Since OHCI-1394 generally requires 4 IT contexts at a minimum, this
+workaround is simply applied whenever the initial read of IsoXmitIntMask
+returns 0, regardless of whether it's a JMicron chip or not. I have never
+heard of this issue with any other chip though.
+
+I am not 100% sure that this fix works on the OHCI-1394 part of JMB380
+and JMB388 combo controllers exactly the same as on the JMB381 single-
+function controller, but so far I haven't had a chance to let an owner
+of a combo chip run a patched kernel.
+
+Strangely enough, IsoRecvIntMask is always reported correctly, even
+though it is probed right before IsoXmitIntMask.
+
+Reported-by: Clifford Dunn
+Reported-by: Craig Moore <craig.moore@qenos.com>
+Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/firewire/ohci.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/firewire/ohci.c
++++ b/drivers/firewire/ohci.c
+@@ -3675,6 +3675,11 @@ static int pci_probe(struct pci_dev *dev
+
+ reg_write(ohci, OHCI1394_IsoXmitIntMaskSet, ~0);
+ ohci->it_context_support = reg_read(ohci, OHCI1394_IsoXmitIntMaskSet);
++ /* JMicron JMB38x often shows 0 at first read, just ignore it */
++ if (!ohci->it_context_support) {
++ ohci_notice(ohci, "overriding IsoXmitIntMask\n");
++ ohci->it_context_support = 0xf;
++ }
+ reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, ~0);
+ ohci->it_context_mask = ohci->it_context_support;
+ ohci->n_it = hweight32(ohci->it_context_mask);
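(Editorial note: the retry-loop variant mentioned in the changelog, which the patch deliberately does not use, would look roughly as follows. This is a sketch only, reusing the reg_read()/reg_write() accessors and register names from drivers/firewire/ohci.c.)

	int retries = 2;

	do {
		reg_write(ohci, OHCI1394_IsoXmitIntMaskSet, ~0);
		ohci->it_context_support =
				reg_read(ohci, OHCI1394_IsoXmitIntMaskSet);
	} while (!ohci->it_context_support && --retries);
	reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, ~0);
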
--- /dev/null
+From c812012f9ca7cf89c9e1a1cd512e6c3b5be04b85 Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@poochiereds.net>
+Date: Wed, 25 Nov 2015 13:50:11 -0500
+Subject: nfs: if we have no valid attrs, then don't declare the attribute cache valid
+
+From: Jeff Layton <jlayton@poochiereds.net>
+
+commit c812012f9ca7cf89c9e1a1cd512e6c3b5be04b85 upstream.
+
+If we pass in an empty nfs_fattr struct to nfs_update_inode, it will
+(correctly) not update any of the attributes, but it then clears the
+NFS_INO_INVALID_ATTR flag, which indicates that the attributes are
+up to date. Don't clear the flag if the fattr struct has no valid
+attrs to apply.
+
+Reviewed-by: Steve French <steve.french@primarydata.com>
+Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/inode.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/nfs/inode.c
++++ b/fs/nfs/inode.c
+@@ -1813,7 +1813,11 @@ static int nfs_update_inode(struct inode
+ if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+ nfsi->attr_gencount = fattr->gencount;
+ }
+- invalid &= ~NFS_INO_INVALID_ATTR;
++
++ /* Don't declare attrcache up to date if there were no attrs! */
++ if (fattr->valid != 0)
++ invalid &= ~NFS_INO_INVALID_ATTR;
++
+ /* Don't invalidate the data if we were to blame */
+ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+ || S_ISLNK(inode->i_mode)))
--- /dev/null
+From c68a027c05709330fe5b2f50c50d5fa02124b5d8 Mon Sep 17 00:00:00 2001
+From: Benjamin Coddington <bcodding@redhat.com>
+Date: Fri, 20 Nov 2015 09:56:20 -0500
+Subject: nfs4: start callback_ident at idr 1
+
+From: Benjamin Coddington <bcodding@redhat.com>
+
+commit c68a027c05709330fe5b2f50c50d5fa02124b5d8 upstream.
+
+If clp->cl_cb_ident is zero, then nfs_cb_idr_remove_locked() skips removing
+it when the nfs_client is freed. A decoding or server bug can then find
+and try to put that first nfs_client, which would lead to a crash.
+
+Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
+Fixes: d6870312659d ("nfs4client: convert to idr_alloc()")
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/nfs4client.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/nfs/nfs4client.c
++++ b/fs/nfs/nfs4client.c
+@@ -33,7 +33,7 @@ static int nfs_get_cb_ident_idr(struct n
+ return ret;
+ idr_preload(GFP_KERNEL);
+ spin_lock(&nn->nfs_client_lock);
+- ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT);
++ ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT);
+ if (ret >= 0)
+ clp->cl_cb_ident = ret;
+ spin_unlock(&nn->nfs_client_lock);
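(Editorial note: the reason ident 0 must never be handed out is the removal side. A sketch of the check the changelog refers to is below; the exact body of nfs_cb_idr_remove_locked() in the tree may differ slightly.)

	static void nfs_cb_idr_remove_locked(struct nfs_client *clp)
	{
		struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);

		/* cl_cb_ident == 0 means "never allocated" and is skipped;
		 * with idr_alloc() starting at 1, a real allocation can no
		 * longer share that value. */
		if (clp->cl_cb_ident)
			idr_remove(&nn->cb_ident_idr, clp->cl_cb_ident);
	}
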
--- /dev/null
+From 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 Mon Sep 17 00:00:00 2001
+From: Andrew Elble <aweits@rit.edu>
+Date: Thu, 15 Oct 2015 12:07:28 -0400
+Subject: nfsd: eliminate sending duplicate and repeated delegations
+
+From: Andrew Elble <aweits@rit.edu>
+
+commit 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 upstream.
+
+We've observed the nfsd server in a state where there are
+multiple delegations on the same nfs4_file for the same client.
+The nfs client does attempt to DELEGRETURN these when they are presented to
+it - but apparently under some (unknown) circumstances the client does not
+manage to return all of them. This leads to the eventual
+attempt to CB_RECALL more than one delegation with the same nfs
+filehandle to the same client. The first recall will succeed, but the
+next recall will fail with NFS4ERR_BADHANDLE. This leads to the server
+having delegations on cl_revoked that the client has no way to FREE
+or DELEGRETURN, with resulting inability to recover. The state manager
+on the server will continually assert SEQ4_STATUS_RECALLABLE_STATE_REVOKED,
+and the state manager on the client will loop, unable to satisfy
+the server.
+
+List discussion also reports a race between OPEN and DELEGRETURN that
+will be avoided by only sending the delegation once to the
+client. This is also logically in accordance with RFC 5661, Sections 9.1.1 and 10.2.
+
+So, let's:
+
+1.) Not hand out duplicate delegations.
+2.) Only send them to the client once.
+
+RFC 5661:
+
+9.1.1:
+"Delegations and layouts, on the other hand, are not associated with a
+specific owner but are associated with the client as a whole
+(identified by a client ID)."
+
+10.2:
+"...the stateid for a delegation is associated with a client ID and may be
+used on behalf of all the open-owners for the given client. A
+delegation is made to the client as a whole and not to any specific
+process or thread of control within it."
+
+Reported-by: Eric Meddaugh <etmsys@rit.edu>
+Cc: Trond Myklebust <trond.myklebust@primarydata.com>
+Cc: Olga Kornievskaia <aglo@umich.edu>
+Signed-off-by: Andrew Elble <aweits@rit.edu>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfsd/nfs4state.c | 94 ++++++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 84 insertions(+), 10 deletions(-)
+
+--- a/fs/nfsd/nfs4state.c
++++ b/fs/nfsd/nfs4state.c
+@@ -765,16 +765,68 @@ void nfs4_unhash_stid(struct nfs4_stid *
+ s->sc_type = 0;
+ }
+
+-static void
++/**
++ * nfs4_get_existing_delegation - Discover if this delegation already exists
++ * @clp: a pointer to the nfs4_client we're granting a delegation to
++ * @fp: a pointer to the nfs4_file we're granting a delegation on
++ *
++ * Return:
++ * On success: NULL if an existing delegation was not found.
++ *
++ * On error: -EAGAIN if one was previously granted to this nfs4_client
++ * for this nfs4_file.
++ *
++ */
++
++static int
++nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
++{
++ struct nfs4_delegation *searchdp = NULL;
++ struct nfs4_client *searchclp = NULL;
++
++ lockdep_assert_held(&state_lock);
++ lockdep_assert_held(&fp->fi_lock);
++
++ list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
++ searchclp = searchdp->dl_stid.sc_client;
++ if (clp == searchclp) {
++ return -EAGAIN;
++ }
++ }
++ return 0;
++}
++
++/**
++ * hash_delegation_locked - Add a delegation to the appropriate lists
++ * @dp: a pointer to the nfs4_delegation we are adding.
++ * @fp: a pointer to the nfs4_file we're granting a delegation on
++ *
++ * Return:
++ * On success: NULL if the delegation was successfully hashed.
++ *
++ * On error: -EAGAIN if one was previously granted to this
++ * nfs4_client for this nfs4_file. Delegation is not hashed.
++ *
++ */
++
++static int
+ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
+ {
++ int status;
++ struct nfs4_client *clp = dp->dl_stid.sc_client;
++
+ lockdep_assert_held(&state_lock);
+ lockdep_assert_held(&fp->fi_lock);
+
++ status = nfs4_get_existing_delegation(clp, fp);
++ if (status)
++ return status;
++ ++fp->fi_delegees;
+ atomic_inc(&dp->dl_stid.sc_count);
+ dp->dl_stid.sc_type = NFS4_DELEG_STID;
+ list_add(&dp->dl_perfile, &fp->fi_delegations);
+- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
++ list_add(&dp->dl_perclnt, &clp->cl_delegations);
++ return 0;
+ }
+
+ static bool
+@@ -3941,6 +3993,18 @@ static struct file_lock *nfs4_alloc_init
+ return fl;
+ }
+
++/**
++ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
++ * @dp: a pointer to the nfs4_delegation we're adding.
++ *
++ * Return:
++ * On success: Return code will be 0 on success.
++ *
++ * On error: -EAGAIN if there was an existing delegation.
++ * nonzero if there is an error in other cases.
++ *
++ */
++
+ static int nfs4_setlease(struct nfs4_delegation *dp)
+ {
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
+@@ -3972,16 +4036,19 @@ static int nfs4_setlease(struct nfs4_del
+ goto out_unlock;
+ /* Race breaker */
+ if (fp->fi_deleg_file) {
+- status = 0;
+- ++fp->fi_delegees;
+- hash_delegation_locked(dp, fp);
++ status = hash_delegation_locked(dp, fp);
+ goto out_unlock;
+ }
+ fp->fi_deleg_file = filp;
+- fp->fi_delegees = 1;
+- hash_delegation_locked(dp, fp);
++ fp->fi_delegees = 0;
++ status = hash_delegation_locked(dp, fp);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
++ if (status) {
++ /* Should never happen, this is a new fi_deleg_file */
++ WARN_ON_ONCE(1);
++ goto out_fput;
++ }
+ return 0;
+ out_unlock:
+ spin_unlock(&fp->fi_lock);
+@@ -4001,6 +4068,15 @@ nfs4_set_delegation(struct nfs4_client *
+ if (fp->fi_had_conflict)
+ return ERR_PTR(-EAGAIN);
+
++ spin_lock(&state_lock);
++ spin_lock(&fp->fi_lock);
++ status = nfs4_get_existing_delegation(clp, fp);
++ spin_unlock(&fp->fi_lock);
++ spin_unlock(&state_lock);
++
++ if (status)
++ return ERR_PTR(status);
++
+ dp = alloc_init_deleg(clp, fh, odstate);
+ if (!dp)
+ return ERR_PTR(-ENOMEM);
+@@ -4019,9 +4095,7 @@ nfs4_set_delegation(struct nfs4_client *
+ status = -EAGAIN;
+ goto out_unlock;
+ }
+- ++fp->fi_delegees;
+- hash_delegation_locked(dp, fp);
+- status = 0;
++ status = hash_delegation_locked(dp, fp);
+ out_unlock:
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
--- /dev/null
+From 35a92fe8770ce54c5eb275cd76128645bea2d200 Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@poochiereds.net>
+Date: Thu, 17 Sep 2015 07:47:08 -0400
+Subject: nfsd: serialize state seqid morphing operations
+
+From: Jeff Layton <jlayton@poochiereds.net>
+
+commit 35a92fe8770ce54c5eb275cd76128645bea2d200 upstream.
+
+Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
+running in parallel. The server would receive the OPEN_DOWNGRADE first
+and check its seqid, but then an OPEN would race in and bump it. The
+OPEN_DOWNGRADE would then complete and bump the seqid again. The result
+was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
+it should have been rejected since the seqid changed.
+
+The only recourse we have here I think is to serialize operations that
+bump the seqid in a stateid, particularly when we're given a seqid in
+the call. To address this, we add a new rw_semaphore to the
+nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
+after looking up the stateid to ensure that nothing else is going to
+bump it while we're operating on it.
+
+In the case of OPEN, we do a down_read, as the call doesn't contain a
+seqid. Those can run in parallel -- we just need to serialize them when
+there is a concurrent OPEN_DOWNGRADE or CLOSE.
+
+LOCK and LOCKU however always take the write lock as there is no
+opportunity for parallelizing those.
+
+Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
+Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfsd/nfs4state.c | 33 ++++++++++++++++++++++++++++-----
+ fs/nfsd/state.h | 19 ++++++++++---------
+ 2 files changed, 38 insertions(+), 14 deletions(-)
+
+--- a/fs/nfsd/nfs4state.c
++++ b/fs/nfsd/nfs4state.c
+@@ -3351,6 +3351,7 @@ static void init_open_stateid(struct nfs
+ stp->st_access_bmap = 0;
+ stp->st_deny_bmap = 0;
+ stp->st_openstp = NULL;
++ init_rwsem(&stp->st_rwsem);
+ spin_lock(&oo->oo_owner.so_client->cl_lock);
+ list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
+ spin_lock(&fp->fi_lock);
+@@ -4181,15 +4182,20 @@ nfsd4_process_open2(struct svc_rqst *rqs
+ */
+ if (stp) {
+ /* Stateid was found, this is an OPEN upgrade */
++ down_read(&stp->st_rwsem);
+ status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
+- if (status)
++ if (status) {
++ up_read(&stp->st_rwsem);
+ goto out;
++ }
+ } else {
+ stp = open->op_stp;
+ open->op_stp = NULL;
+ init_open_stateid(stp, fp, open);
++ down_read(&stp->st_rwsem);
+ status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
+ if (status) {
++ up_read(&stp->st_rwsem);
+ release_open_stateid(stp);
+ goto out;
+ }
+@@ -4201,6 +4207,7 @@ nfsd4_process_open2(struct svc_rqst *rqs
+ }
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
++ up_read(&stp->st_rwsem);
+
+ if (nfsd4_has_session(&resp->cstate)) {
+ if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
+@@ -4777,10 +4784,13 @@ static __be32 nfs4_seqid_op_checks(struc
+ * revoked delegations are kept only for free_stateid.
+ */
+ return nfserr_bad_stateid;
++ down_write(&stp->st_rwsem);
+ status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
+- if (status)
+- return status;
+- return nfs4_check_fh(current_fh, &stp->st_stid);
++ if (status == nfs_ok)
++ status = nfs4_check_fh(current_fh, &stp->st_stid);
++ if (status != nfs_ok)
++ up_write(&stp->st_rwsem);
++ return status;
+ }
+
+ /*
+@@ -4827,6 +4837,7 @@ static __be32 nfs4_preprocess_confirmed_
+ return status;
+ oo = openowner(stp->st_stateowner);
+ if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
++ up_write(&stp->st_rwsem);
+ nfs4_put_stid(&stp->st_stid);
+ return nfserr_bad_stateid;
+ }
+@@ -4857,11 +4868,14 @@ nfsd4_open_confirm(struct svc_rqst *rqst
+ goto out;
+ oo = openowner(stp->st_stateowner);
+ status = nfserr_bad_stateid;
+- if (oo->oo_flags & NFS4_OO_CONFIRMED)
++ if (oo->oo_flags & NFS4_OO_CONFIRMED) {
++ up_write(&stp->st_rwsem);
+ goto put_stateid;
++ }
+ oo->oo_flags |= NFS4_OO_CONFIRMED;
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
++ up_write(&stp->st_rwsem);
+ dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
+ __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
+
+@@ -4940,6 +4954,7 @@ nfsd4_open_downgrade(struct svc_rqst *rq
+ memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ status = nfs_ok;
+ put_stateid:
++ up_write(&stp->st_rwsem);
+ nfs4_put_stid(&stp->st_stid);
+ out:
+ nfsd4_bump_seqid(cstate, status);
+@@ -4993,6 +5008,7 @@ nfsd4_close(struct svc_rqst *rqstp, stru
+ goto out;
+ update_stateid(&stp->st_stid.sc_stateid);
+ memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
++ up_write(&stp->st_rwsem);
+
+ nfsd4_close_open_stateid(stp);
+
+@@ -5223,6 +5239,7 @@ init_lock_stateid(struct nfs4_ol_stateid
+ stp->st_access_bmap = 0;
+ stp->st_deny_bmap = open_stp->st_deny_bmap;
+ stp->st_openstp = open_stp;
++ init_rwsem(&stp->st_rwsem);
+ list_add(&stp->st_locks, &open_stp->st_locks);
+ list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+ spin_lock(&fp->fi_lock);
+@@ -5391,6 +5408,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+ &open_stp, nn);
+ if (status)
+ goto out;
++ up_write(&open_stp->st_rwsem);
+ open_sop = openowner(open_stp->st_stateowner);
+ status = nfserr_bad_stateid;
+ if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+@@ -5398,6 +5416,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+ goto out;
+ status = lookup_or_create_lock_state(cstate, open_stp, lock,
+ &lock_stp, &new);
++ if (status == nfs_ok)
++ down_write(&lock_stp->st_rwsem);
+ } else {
+ status = nfs4_preprocess_seqid_op(cstate,
+ lock->lk_old_lock_seqid,
+@@ -5503,6 +5523,8 @@ out:
+ seqid_mutating_err(ntohl(status)))
+ lock_sop->lo_owner.so_seqid++;
+
++ up_write(&lock_stp->st_rwsem);
++
+ /*
+ * If this is a new, never-before-used stateid, and we are
+ * returning an error, then just go ahead and release it.
+@@ -5673,6 +5695,7 @@ nfsd4_locku(struct svc_rqst *rqstp, stru
+ fput:
+ fput(filp);
+ put_stateid:
++ up_write(&stp->st_rwsem);
+ nfs4_put_stid(&stp->st_stid);
+ out:
+ nfsd4_bump_seqid(cstate, status);
+--- a/fs/nfsd/state.h
++++ b/fs/nfsd/state.h
+@@ -533,15 +533,16 @@ struct nfs4_file {
+ * Better suggestions welcome.
+ */
+ struct nfs4_ol_stateid {
+- struct nfs4_stid st_stid; /* must be first field */
+- struct list_head st_perfile;
+- struct list_head st_perstateowner;
+- struct list_head st_locks;
+- struct nfs4_stateowner * st_stateowner;
+- struct nfs4_clnt_odstate * st_clnt_odstate;
+- unsigned char st_access_bmap;
+- unsigned char st_deny_bmap;
+- struct nfs4_ol_stateid * st_openstp;
++ struct nfs4_stid st_stid;
++ struct list_head st_perfile;
++ struct list_head st_perstateowner;
++ struct list_head st_locks;
++ struct nfs4_stateowner *st_stateowner;
++ struct nfs4_clnt_odstate *st_clnt_odstate;
++ unsigned char st_access_bmap;
++ unsigned char st_deny_bmap;
++ struct nfs4_ol_stateid *st_openstp;
++ struct rw_semaphore st_rwsem;
+ };
+
+ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
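(Editorial note: condensed, the protocol introduced above is that operations which carry and bump a seqid take st_rwsem for write, while OPEN, which carries no seqid, takes it only for read. The helper below is hypothetical and for illustration only; the real logic is spread across nfsd4_process_open2() and nfs4_preprocess_seqid_op() as shown in the diff.)

	static void seqid_morph_begin(struct nfs4_ol_stateid *stp, bool has_seqid)
	{
		if (has_seqid)
			/* OPEN_CONFIRM, OPEN_DOWNGRADE, CLOSE, LOCK, LOCKU */
			down_write(&stp->st_rwsem);
		else
			/* OPEN: no seqid in the call, may run in parallel */
			down_read(&stp->st_rwsem);
	}
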
--- /dev/null
+From 8f1eb48758aacf6c1ffce18179295adbf3bd7640 Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Fri, 20 Nov 2015 15:57:30 -0800
+Subject: ocfs2: fix umask ignored issue
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit 8f1eb48758aacf6c1ffce18179295adbf3bd7640 upstream.
+
+A newly created file's mode is not masked with the umask, so the umask has
+no effect on ocfs2 volumes.
+
+Fixes: 702e5bc ("ocfs2: use generic posix ACL infrastructure")
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Gang He <ghe@suse.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/namei.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ocfs2/namei.c
++++ b/fs/ocfs2/namei.c
+@@ -365,6 +365,8 @@ static int ocfs2_mknod(struct inode *dir
+ mlog_errno(status);
+ goto leave;
+ }
++ /* update inode->i_mode after mask with "umask". */
++ inode->i_mode = mode;
+
+ handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+ S_ISDIR(mode),
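(Editorial note: the symptom is easy to demonstrate from userspace. A minimal test program follows, assuming the current directory is on an ocfs2 mount; before this fix the reported mode stays 0666 instead of 0644.)

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		struct stat st;
		int fd;

		umask(022);	/* a requested mode of 0666 should become 0644 */
		fd = open("umask-test", O_CREAT | O_WRONLY, 0666);
		if (fd < 0)
			return 1;
		fstat(fd, &st);
		printf("mode: %04o (expected 0644)\n",
		       (unsigned int)(st.st_mode & 07777));
		return 0;
	}
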
--- /dev/null
+From 70b16db86f564977df074072143284aec2cb1162 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov <idryomov@gmail.com>
+Date: Fri, 27 Nov 2015 19:23:24 +0100
+Subject: rbd: don't put snap_context twice in rbd_queue_workfn()
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit 70b16db86f564977df074072143284aec2cb1162 upstream.
+
+Commit 4e752f0ab0e8 ("rbd: access snapshot context and mapping size
+safely") moved ceph_get_snap_context() out of rbd_img_request_create()
+and into rbd_queue_workfn(), adding a ceph_put_snap_context() to the
+error path in rbd_queue_workfn(). However, rbd_img_request_create()
+consumes a ref on snapc, so calling ceph_put_snap_context() after
+a successful rbd_img_request_create() leads to an extra put. Fix it.
+
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Josh Durgin <jdurgin@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/rbd.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -3417,6 +3417,7 @@ static void rbd_queue_workfn(struct work
+ goto err_rq;
+ }
+ img_request->rq = rq;
++ snapc = NULL; /* img_request consumes a ref */
+
+ if (op_type == OBJ_OP_DISCARD)
+ result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
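(Editorial note: the rule at work is plain reference-ownership transfer. Once rbd_img_request_create() succeeds it owns the snapc reference, so the caller clears its local pointer before any path that reaches the common error label. The sketch below paraphrases the fixed code path rather than quoting rbd_queue_workfn() verbatim.)

	img_request = rbd_img_request_create(rbd_dev, offset, length,
					     op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_rq;		/* we still own snapc; err_rq drops it */
	}
	img_request->rq = rq;
	snapc = NULL;			/* ref now owned by img_request */

With snapc cleared, the ceph_put_snap_context() call on the error path has nothing left to drop, so the extra put can no longer happen.
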
net-neighbour-fix-crash-at-dumping-device-agnostic-proxy-entries.patch
ipv6-sctp-implement-sctp_v6_destroy_sock.patch
net_sched-fix-qdisc_tree_decrease_qlen-races.patch
+btrfs-check-unsupported-filters-in-balance-arguments.patch
+btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch
+btrfs-fix-truncation-of-compressed-and-inlined-extents.patch
+btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch
+btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch
+btrfs-fix-race-when-listing-an-inode-s-xattrs.patch
+rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch
+ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch
+ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch
+ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch
+firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch
+nfsd-serialize-state-seqid-morphing-operations.patch
+nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch
+debugfs-fix-refcount-imbalance-in-start_creating.patch
+nfs4-start-callback_ident-at-idr-1.patch
+nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch
+ocfs2-fix-umask-ignored-issue.patch
+ceph-fix-message-length-computation.patch
+alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch