]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.3-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 11 Dec 2015 17:20:21 +0000 (09:20 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 11 Dec 2015 17:20:21 +0000 (09:20 -0800)
added patches:
alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch
alsa-pci-depend-on-zone_dma.patch
block-fix-segment-split.patch
btrfs-check-unsupported-filters-in-balance-arguments.patch
btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch
btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch
btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch
btrfs-fix-race-when-listing-an-inode-s-xattrs.patch
btrfs-fix-regression-when-running-delayed-references.patch
btrfs-fix-resending-received-snapshot-with-parent.patch
btrfs-fix-signed-overflows-in-btrfs_sync_file.patch
btrfs-fix-truncation-of-compressed-and-inlined-extents.patch
ceph-fix-message-length-computation.patch
cobalt-fix-kconfig-dependency.patch
debugfs-fix-refcount-imbalance-in-start_creating.patch
ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch
ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch
ext4-crypto-replace-some-bug_on-s-with-error-checks.patch
ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch
ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch
firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch
nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch
nfs4-limit-callback-decoding-to-received-bytes.patch
nfs4-resend-layoutget-when-there-is-a-race-that-changes-the-seqid.patch
nfs4-start-callback_ident-at-idr-1.patch
nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch
nfsd-serialize-state-seqid-morphing-operations.patch
ocfs2-fix-umask-ignored-issue.patch
rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch

30 files changed:
queue-4.3/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch [new file with mode: 0644]
queue-4.3/alsa-pci-depend-on-zone_dma.patch [new file with mode: 0644]
queue-4.3/block-fix-segment-split.patch [new file with mode: 0644]
queue-4.3/btrfs-check-unsupported-filters-in-balance-arguments.patch [new file with mode: 0644]
queue-4.3/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch [new file with mode: 0644]
queue-4.3/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch [new file with mode: 0644]
queue-4.3/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch [new file with mode: 0644]
queue-4.3/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch [new file with mode: 0644]
queue-4.3/btrfs-fix-regression-when-running-delayed-references.patch [new file with mode: 0644]
queue-4.3/btrfs-fix-resending-received-snapshot-with-parent.patch [new file with mode: 0644]
queue-4.3/btrfs-fix-signed-overflows-in-btrfs_sync_file.patch [new file with mode: 0644]
queue-4.3/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch [new file with mode: 0644]
queue-4.3/ceph-fix-message-length-computation.patch [new file with mode: 0644]
queue-4.3/cobalt-fix-kconfig-dependency.patch [new file with mode: 0644]
queue-4.3/debugfs-fix-refcount-imbalance-in-start_creating.patch [new file with mode: 0644]
queue-4.3/ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch [new file with mode: 0644]
queue-4.3/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch [new file with mode: 0644]
queue-4.3/ext4-crypto-replace-some-bug_on-s-with-error-checks.patch [new file with mode: 0644]
queue-4.3/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch [new file with mode: 0644]
queue-4.3/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch [new file with mode: 0644]
queue-4.3/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch [new file with mode: 0644]
queue-4.3/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch [new file with mode: 0644]
queue-4.3/nfs4-limit-callback-decoding-to-received-bytes.patch [new file with mode: 0644]
queue-4.3/nfs4-resend-layoutget-when-there-is-a-race-that-changes-the-seqid.patch [new file with mode: 0644]
queue-4.3/nfs4-start-callback_ident-at-idr-1.patch [new file with mode: 0644]
queue-4.3/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch [new file with mode: 0644]
queue-4.3/nfsd-serialize-state-seqid-morphing-operations.patch [new file with mode: 0644]
queue-4.3/ocfs2-fix-umask-ignored-issue.patch [new file with mode: 0644]
queue-4.3/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch [new file with mode: 0644]
queue-4.3/series

diff --git a/queue-4.3/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch b/queue-4.3/alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch
new file mode 100644 (file)
index 0000000..a75e0bd
--- /dev/null
@@ -0,0 +1,33 @@
+From e2656412f2a7343ecfd13eb74bac0a6e6e9c5aad Mon Sep 17 00:00:00 2001
+From: "Lu, Han" <han.lu@intel.com>
+Date: Wed, 11 Nov 2015 16:54:27 +0800
+Subject: ALSA: hda/hdmi - apply Skylake fix-ups to Broxton display codec
+
+From: "Lu, Han" <han.lu@intel.com>
+
+commit e2656412f2a7343ecfd13eb74bac0a6e6e9c5aad upstream.
+
+Broxton and Skylake have the same behavior on display audio. So this patch
+applies Skylake fix-ups to Broxton.
+
+Signed-off-by: Lu, Han <han.lu@intel.com>
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ sound/pci/hda/patch_hdmi.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/sound/pci/hda/patch_hdmi.c
++++ b/sound/pci/hda/patch_hdmi.c
+@@ -50,8 +50,9 @@ MODULE_PARM_DESC(static_hdmi_pcm, "Don't
+ #define is_haswell(codec)  ((codec)->core.vendor_id == 0x80862807)
+ #define is_broadwell(codec)    ((codec)->core.vendor_id == 0x80862808)
+ #define is_skylake(codec) ((codec)->core.vendor_id == 0x80862809)
++#define is_broxton(codec) ((codec)->core.vendor_id == 0x8086280a)
+ #define is_haswell_plus(codec) (is_haswell(codec) || is_broadwell(codec) \
+-                                      || is_skylake(codec))
++                              || is_skylake(codec) || is_broxton(codec))
+ #define is_valleyview(codec) ((codec)->core.vendor_id == 0x80862882)
+ #define is_cherryview(codec) ((codec)->core.vendor_id == 0x80862883)
diff --git a/queue-4.3/alsa-pci-depend-on-zone_dma.patch b/queue-4.3/alsa-pci-depend-on-zone_dma.patch
new file mode 100644 (file)
index 0000000..655bdd9
--- /dev/null
@@ -0,0 +1,135 @@
+From 2db1a57986d37653583e67ccbf13082aadc8f25d Mon Sep 17 00:00:00 2001
+From: Dan Williams <dan.j.williams@intel.com>
+Date: Thu, 12 Nov 2015 12:13:57 -0800
+Subject: ALSA: pci: depend on ZONE_DMA
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+commit 2db1a57986d37653583e67ccbf13082aadc8f25d upstream.
+
+There are several sound drivers that 'select ZONE_DMA'.  This is
+backwards as ZONE_DMA is an architecture capability exported to drivers.
+Switch the polarity of the dependency to disable these drivers when the
+architecture does not support ZONE_DMA.  This was discovered in the
+context of testing/enabling devm_memremap_pages() which depends on
+ZONE_DEVICE.  ZONE_DEVICE in turn depends on !ZONE_DMA.
+
+Reported-by: Jeff Moyer <jmoyer@redhat.com>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ sound/pci/Kconfig |   24 ++++++++++++------------
+ 1 file changed, 12 insertions(+), 12 deletions(-)
+
+--- a/sound/pci/Kconfig
++++ b/sound/pci/Kconfig
+@@ -25,7 +25,7 @@ config SND_ALS300
+       select SND_PCM
+       select SND_AC97_CODEC
+       select SND_OPL3_LIB
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say 'Y' or 'M' to include support for Avance Logic ALS300/ALS300+
+@@ -50,7 +50,7 @@ config SND_ALI5451
+       tristate "ALi M5451 PCI Audio Controller"
+       select SND_MPU401_UART
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for the integrated AC97 sound
+         device on motherboards using the ALi M5451 Audio Controller
+@@ -155,7 +155,7 @@ config SND_AZT3328
+       select SND_PCM
+       select SND_RAWMIDI
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for Aztech AZF3328 (PCI168)
+         soundcards.
+@@ -463,7 +463,7 @@ config SND_EMU10K1
+       select SND_HWDEP
+       select SND_RAWMIDI
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y to include support for Sound Blaster PCI 512, Live!,
+         Audigy and E-mu APS (partially supported) soundcards.
+@@ -479,7 +479,7 @@ config SND_EMU10K1X
+       tristate "Emu10k1X (Dell OEM Version)"
+       select SND_AC97_CODEC
+       select SND_RAWMIDI
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for the Dell OEM version of the
+         Sound Blaster Live!.
+@@ -513,7 +513,7 @@ config SND_ES1938
+       select SND_OPL3_LIB
+       select SND_MPU401_UART
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for soundcards based on ESS Solo-1
+         (ES1938, ES1946, ES1969) chips.
+@@ -525,7 +525,7 @@ config SND_ES1968
+       tristate "ESS ES1968/1978 (Maestro-1/2/2E)"
+       select SND_MPU401_UART
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for soundcards based on ESS Maestro
+         1/2/2E chips.
+@@ -612,7 +612,7 @@ config SND_ICE1712
+       select SND_MPU401_UART
+       select SND_AC97_CODEC
+       select BITREVERSE
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for soundcards based on the
+         ICE1712 (Envy24) chip.
+@@ -700,7 +700,7 @@ config SND_LX6464ES
+ config SND_MAESTRO3
+       tristate "ESS Allegro/Maestro3"
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for soundcards based on ESS Maestro 3
+         (Allegro) chips.
+@@ -806,7 +806,7 @@ config SND_SIS7019
+       tristate "SiS 7019 Audio Accelerator"
+       depends on X86_32
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for the SiS 7019 Audio Accelerator.
+@@ -818,7 +818,7 @@ config SND_SONICVIBES
+       select SND_OPL3_LIB
+       select SND_MPU401_UART
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for soundcards based on the S3
+         SonicVibes chip.
+@@ -830,7 +830,7 @@ config SND_TRIDENT
+       tristate "Trident 4D-Wave DX/NX; SiS 7018"
+       select SND_MPU401_UART
+       select SND_AC97_CODEC
+-      select ZONE_DMA
++      depends on ZONE_DMA
+       help
+         Say Y here to include support for soundcards based on Trident
+         4D-Wave DX/NX or SiS 7018 chips.
diff --git a/queue-4.3/block-fix-segment-split.patch b/queue-4.3/block-fix-segment-split.patch
new file mode 100644 (file)
index 0000000..b4abcb2
--- /dev/null
@@ -0,0 +1,46 @@
+From 578270bfbd2803dc7b0b03fbc2ac119efbc73195 Mon Sep 17 00:00:00 2001
+From: Ming Lei <ming.lei@canonical.com>
+Date: Tue, 24 Nov 2015 10:35:29 +0800
+Subject: block: fix segment split
+
+From: Ming Lei <ming.lei@canonical.com>
+
+commit 578270bfbd2803dc7b0b03fbc2ac119efbc73195 upstream.
+
+Inside blk_bio_segment_split(), previous bvec pointer(bvprvp)
+always points to the iterator local variable, which is obviously
+wrong, so fix it by pointing to the local variable of 'bvprv'.
+
+Fixes: 5014c311baa2b(block: fix bogus compiler warnings in blk-merge.c)
+Reported-by: Michael Ellerman <mpe@ellerman.id.au>
+Reported-by: Mark Salter <msalter@redhat.com>
+Tested-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
+Tested-by: Mark Salter <msalter@redhat.com>
+Signed-off-by: Ming Lei <ming.lei@canonical.com>
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/blk-merge.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/block/blk-merge.c
++++ b/block/blk-merge.c
+@@ -91,7 +91,7 @@ static struct bio *blk_bio_segment_split
+                       seg_size += bv.bv_len;
+                       bvprv = bv;
+-                      bvprvp = &bv;
++                      bvprvp = &bvprv;
+                       sectors += bv.bv_len >> 9;
+                       continue;
+               }
+@@ -101,7 +101,7 @@ new_segment:
+               nsegs++;
+               bvprv = bv;
+-              bvprvp = &bv;
++              bvprvp = &bvprv;
+               seg_size = bv.bv_len;
+               sectors += bv.bv_len >> 9;
+       }
diff --git a/queue-4.3/btrfs-check-unsupported-filters-in-balance-arguments.patch b/queue-4.3/btrfs-check-unsupported-filters-in-balance-arguments.patch
new file mode 100644 (file)
index 0000000..164a9ba
--- /dev/null
@@ -0,0 +1,57 @@
+From 849ef9286f30c88113906dc35f44a499c0cb385d Mon Sep 17 00:00:00 2001
+From: David Sterba <dsterba@suse.com>
+Date: Mon, 12 Oct 2015 16:55:54 +0200
+Subject: btrfs: check unsupported filters in balance arguments
+
+From: David Sterba <dsterba@suse.com>
+
+commit 849ef9286f30c88113906dc35f44a499c0cb385d upstream.
+
+We don't verify that all the balance filter arguments supplemented by
+the flags are actually known to the kernel. Thus we let it silently pass
+and do nothing.
+
+At the moment this means only the 'limit' filter, but we're going to add
+a few more soon so it's better to have that fixed. Also in older stable
+kernels so that it works with newer userspace tools.
+
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c   |    5 +++++
+ fs/btrfs/volumes.h |    8 ++++++++
+ 2 files changed, 13 insertions(+)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -4644,6 +4644,11 @@ locked:
+               goto out_bctl;
+       }
++      if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
++              ret = -EINVAL;
++              goto out_bargs;
++      }
++
+ do_balance:
+       /*
+        * Ownership of bctl and mutually_exclusive_operation_running
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -384,6 +384,14 @@ struct map_lookup {
+        BTRFS_BALANCE_ARGS_VRANGE |            \
+        BTRFS_BALANCE_ARGS_LIMIT)
++#define BTRFS_BALANCE_ARGS_MASK                       \
++      (BTRFS_BALANCE_ARGS_PROFILES |          \
++       BTRFS_BALANCE_ARGS_USAGE |             \
++       BTRFS_BALANCE_ARGS_DEVID |             \
++       BTRFS_BALANCE_ARGS_DRANGE |            \
++       BTRFS_BALANCE_ARGS_VRANGE |            \
++       BTRFS_BALANCE_ARGS_LIMIT)
++
+ /*
+  * Profile changing flags.  When SOFT is set we won't relocate chunk if
+  * it already has the target profile (even though it may be
diff --git a/queue-4.3/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch b/queue-4.3/btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch
new file mode 100644 (file)
index 0000000..6ddaed6
--- /dev/null
@@ -0,0 +1,443 @@
+From 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 13 Oct 2015 15:15:00 +0100
+Subject: Btrfs: fix file corruption and data loss after cloning inline extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 8039d87d9e473aeb740d4fdbd59b9d2f89b2ced9 upstream.
+
+Currently the clone ioctl allows to clone an inline extent from one file
+to another that already has other (non-inlined) extents. This is a problem
+because btrfs is not designed to deal with files having inline and regular
+extents, if a file has an inline extent then it must be the only extent
+in the file and must start at file offset 0. Having a file with an inline
+extent followed by regular extents results in EIO errors when doing reads
+or writes against the first 4K of the file.
+
+Also, the clone ioctl allows one to lose data if the source file consists
+of a single inline extent, with a size of N bytes, and the destination
+file consists of a single inline extent with a size of M bytes, where we
+have M > N. In this case the clone operation removes the inline extent
+from the destination file and then copies the inline extent from the
+source file into the destination file - we lose the M - N bytes from the
+destination file, a read operation will get the value 0x00 for any bytes
+in the range [N, M] (the destination inode's i_size remained as M,
+that's why we can read past N bytes).
+
+So fix this by not allowing such destructive operations to happen and
+return errno EOPNOTSUPP to user space.
+
+Currently the fstest btrfs/035 tests the data loss case but it totally
+ignores this - i.e. expects the operation to succeed and does not check
+that we got data loss.
+
+The following test case for fstests exercises all these cases that result
+in file corruption and data loss:
+
+  seq=`basename $0`
+  seqres=$RESULT_DIR/$seq
+  echo "QA output created by $seq"
+  tmp=/tmp/$$
+  status=1     # failure is the default!
+  trap "_cleanup; exit \$status" 0 1 2 3 15
+
+  _cleanup()
+  {
+      rm -f $tmp.*
+  }
+
+  # get standard environment, filters and checks
+  . ./common/rc
+  . ./common/filter
+
+  # real QA test starts here
+  _need_to_be_root
+  _supported_fs btrfs
+  _supported_os Linux
+  _require_scratch
+  _require_cloner
+  _require_btrfs_fs_feature "no_holes"
+  _require_btrfs_mkfs_feature "no-holes"
+
+  rm -f $seqres.full
+
+  test_cloning_inline_extents()
+  {
+      local mkfs_opts=$1
+      local mount_opts=$2
+
+      _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1
+      _scratch_mount $mount_opts
+
+      # File bar, the source for all the following clone operations, consists
+      # of a single inline extent (50 bytes).
+      $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \
+          | _filter_xfs_io
+
+      # Test cloning into a file with an extent (non-inlined) where the
+      # destination offset overlaps that extent. It should not be possible to
+      # clone the inline extent from file bar into this file.
+      $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \
+          | _filter_xfs_io
+      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo
+
+      # Doing IO against any range in the first 4K of the file should work.
+      # Due to a past clone ioctl bug which allowed cloning the inline extent,
+      # these operations resulted in EIO errors.
+      echo "File foo data after clone operation:"
+      # All bytes should have the value 0xaa (clone operation failed and did
+      # not modify our file).
+      od -t x1 $SCRATCH_MNT/foo
+      $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io
+
+      # Test cloning the inline extent against a file which has a hole in its
+      # first 4K followed by a non-inlined extent. It should not be possible
+      # as well to clone the inline extent from file bar into this file.
+      $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \
+          | _filter_xfs_io
+      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2
+
+      # Doing IO against any range in the first 4K of the file should work.
+      # Due to a past clone ioctl bug which allowed cloning the inline extent,
+      # these operations resulted in EIO errors.
+      echo "File foo2 data after clone operation:"
+      # All bytes should have the value 0x00 (clone operation failed and did
+      # not modify our file).
+      od -t x1 $SCRATCH_MNT/foo2
+      $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io
+
+      # Test cloning the inline extent against a file which has a size of zero
+      # but has a prealloc extent. It should not be possible as well to clone
+      # the inline extent from file bar into this file.
+      $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io
+      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3
+
+      # Doing IO against any range in the first 4K of the file should work.
+      # Due to a past clone ioctl bug which allowed cloning the inline extent,
+      # these operations resulted in EIO errors.
+      echo "First 50 bytes of foo3 after clone operation:"
+      # Should not be able to read any bytes, file has 0 bytes i_size (the
+      # clone operation failed and did not modify our file).
+      od -t x1 $SCRATCH_MNT/foo3
+      $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io
+
+      # Test cloning the inline extent against a file which consists of a
+      # single inline extent that has a size not greater than the size of
+      # bar's inline extent (40 < 50).
+      # It should be possible to do the extent cloning from bar to this file.
+      $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \
+          | _filter_xfs_io
+      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4
+
+      # Doing IO against any range in the first 4K of the file should work.
+      echo "File foo4 data after clone operation:"
+      # Must match file bar's content.
+      od -t x1 $SCRATCH_MNT/foo4
+      $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io
+
+      # Test cloning the inline extent against a file which consists of a
+      # single inline extent that has a size greater than the size of bar's
+      # inline extent (60 > 50).
+      # It should not be possible to clone the inline extent from file bar
+      # into this file.
+      $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \
+          | _filter_xfs_io
+      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5
+
+      # Reading the file should not fail.
+      echo "File foo5 data after clone operation:"
+      # Must have a size of 60 bytes, with all bytes having a value of 0x03
+      # (the clone operation failed and did not modify our file).
+      od -t x1 $SCRATCH_MNT/foo5
+
+      # Test cloning the inline extent against a file which has no extents but
+      # has a size greater than bar's inline extent (16K > 50).
+      # It should not be possible to clone the inline extent from file bar
+      # into this file.
+      $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io
+      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6
+
+      # Reading the file should not fail.
+      echo "File foo6 data after clone operation:"
+      # Must have a size of 16K, with all bytes having a value of 0x00 (the
+      # clone operation failed and did not modify our file).
+      od -t x1 $SCRATCH_MNT/foo6
+
+      # Test cloning the inline extent against a file which has no extents but
+      # has a size not greater than bar's inline extent (30 < 50).
+      # It should be possible to clone the inline extent from file bar into
+      # this file.
+      $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io
+      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7
+
+      # Reading the file should not fail.
+      echo "File foo7 data after clone operation:"
+      # Must have a size of 50 bytes, with all bytes having a value of 0xbb.
+      od -t x1 $SCRATCH_MNT/foo7
+
+      # Test cloning the inline extent against a file which has a size not
+      # greater than the size of bar's inline extent (20 < 50) but has
+      # a prealloc extent that goes beyond the file's size. It should not be
+      # possible to clone the inline extent from bar into this file.
+      $XFS_IO_PROG -f -c "falloc -k 0 1M" \
+                      -c "pwrite -S 0x88 0 20" \
+                      $SCRATCH_MNT/foo8 | _filter_xfs_io
+      $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8
+
+      echo "File foo8 data after clone operation:"
+      # Must have a size of 20 bytes, with all bytes having a value of 0x88
+      # (the clone operation did not modify our file).
+      od -t x1 $SCRATCH_MNT/foo8
+
+      _scratch_unmount
+  }
+
+  echo -e "\nTesting without compression and without the no-holes feature...\n"
+  test_cloning_inline_extents
+
+  echo -e "\nTesting with compression and without the no-holes feature...\n"
+  test_cloning_inline_extents "" "-o compress"
+
+  echo -e "\nTesting without compression and with the no-holes feature...\n"
+  test_cloning_inline_extents "-O no-holes" ""
+
+  echo -e "\nTesting with compression and with the no-holes feature...\n"
+  test_cloning_inline_extents "-O no-holes" "-o compress"
+
+  status=0
+  exit
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c |  195 ++++++++++++++++++++++++++++++++++++++++++-------------
+ 1 file changed, 152 insertions(+), 43 deletions(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -3328,6 +3328,150 @@ static void clone_update_extent_map(stru
+                       &BTRFS_I(inode)->runtime_flags);
+ }
++/*
++ * Make sure we do not end up inserting an inline extent into a file that has
++ * already other (non-inline) extents. If a file has an inline extent it can
++ * not have any other extents and the (single) inline extent must start at the
++ * file offset 0. Failing to respect these rules will lead to file corruption,
++ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
++ *
++ * We can have extents that have been already written to disk or we can have
++ * dirty ranges still in delalloc, in which case the extent maps and items are
++ * created only when we run delalloc, and the delalloc ranges might fall outside
++ * the range we are currently locking in the inode's io tree. So we check the
++ * inode's i_size because of that (i_size updates are done while holding the
++ * i_mutex, which we are holding here).
++ * We also check to see if the inode has a size not greater than "datal" but has
++ * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
++ * protected against such concurrent fallocate calls by the i_mutex).
++ *
++ * If the file has no extents but a size greater than datal, do not allow the
++ * copy because we would need turn the inline extent into a non-inline one (even
++ * with NO_HOLES enabled). If we find our destination inode only has one inline
++ * extent, just overwrite it with the source inline extent if its size is less
++ * than the source extent's size, or we could copy the source inline extent's
++ * data into the destination inode's inline extent if the later is greater then
++ * the former.
++ */
++static int clone_copy_inline_extent(struct inode *src,
++                                  struct inode *dst,
++                                  struct btrfs_trans_handle *trans,
++                                  struct btrfs_path *path,
++                                  struct btrfs_key *new_key,
++                                  const u64 drop_start,
++                                  const u64 datal,
++                                  const u64 skip,
++                                  const u64 size,
++                                  char *inline_data)
++{
++      struct btrfs_root *root = BTRFS_I(dst)->root;
++      const u64 aligned_end = ALIGN(new_key->offset + datal,
++                                    root->sectorsize);
++      int ret;
++      struct btrfs_key key;
++
++      if (new_key->offset > 0)
++              return -EOPNOTSUPP;
++
++      key.objectid = btrfs_ino(dst);
++      key.type = BTRFS_EXTENT_DATA_KEY;
++      key.offset = 0;
++      ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
++      if (ret < 0) {
++              return ret;
++      } else if (ret > 0) {
++              if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
++                      ret = btrfs_next_leaf(root, path);
++                      if (ret < 0)
++                              return ret;
++                      else if (ret > 0)
++                              goto copy_inline_extent;
++              }
++              btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
++              if (key.objectid == btrfs_ino(dst) &&
++                  key.type == BTRFS_EXTENT_DATA_KEY) {
++                      ASSERT(key.offset > 0);
++                      return -EOPNOTSUPP;
++              }
++      } else if (i_size_read(dst) <= datal) {
++              struct btrfs_file_extent_item *ei;
++              u64 ext_len;
++
++              /*
++               * If the file size is <= datal, make sure there are no other
++               * extents following (can happen do to an fallocate call with
++               * the flag FALLOC_FL_KEEP_SIZE).
++               */
++              ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
++                                  struct btrfs_file_extent_item);
++              /*
++               * If it's an inline extent, it can not have other extents
++               * following it.
++               */
++              if (btrfs_file_extent_type(path->nodes[0], ei) ==
++                  BTRFS_FILE_EXTENT_INLINE)
++                      goto copy_inline_extent;
++
++              ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
++              if (ext_len > aligned_end)
++                      return -EOPNOTSUPP;
++
++              ret = btrfs_next_item(root, path);
++              if (ret < 0) {
++                      return ret;
++              } else if (ret == 0) {
++                      btrfs_item_key_to_cpu(path->nodes[0], &key,
++                                            path->slots[0]);
++                      if (key.objectid == btrfs_ino(dst) &&
++                          key.type == BTRFS_EXTENT_DATA_KEY)
++                              return -EOPNOTSUPP;
++              }
++      }
++
++copy_inline_extent:
++      /*
++       * We have no extent items, or we have an extent at offset 0 which may
++       * or may not be inlined. All these cases are dealt the same way.
++       */
++      if (i_size_read(dst) > datal) {
++              /*
++               * If the destination inode has an inline extent...
++               * This would require copying the data from the source inline
++               * extent into the beginning of the destination's inline extent.
++               * But this is really complex, both extents can be compressed
++               * or just one of them, which would require decompressing and
++               * re-compressing data (which could increase the new compressed
++               * size, not allowing the compressed data to fit anymore in an
++               * inline extent).
++               * So just don't support this case for now (it should be rare,
++               * we are not really saving space when cloning inline extents).
++               */
++              return -EOPNOTSUPP;
++      }
++
++      btrfs_release_path(path);
++      ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
++      if (ret)
++              return ret;
++      ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
++      if (ret)
++              return ret;
++
++      if (skip) {
++              const u32 start = btrfs_file_extent_calc_inline_size(0);
++
++              memmove(inline_data + start, inline_data + start + skip, datal);
++      }
++
++      write_extent_buffer(path->nodes[0], inline_data,
++                          btrfs_item_ptr_offset(path->nodes[0],
++                                                path->slots[0]),
++                          size);
++      inode_add_bytes(dst, datal);
++
++      return 0;
++}
++
+ /**
+  * btrfs_clone() - clone a range from inode file to another
+  *
+@@ -3594,21 +3738,6 @@ process_slot:
+                       } else if (type == BTRFS_FILE_EXTENT_INLINE) {
+                               u64 skip = 0;
+                               u64 trim = 0;
+-                              u64 aligned_end = 0;
+-
+-                              /*
+-                               * Don't copy an inline extent into an offset
+-                               * greater than zero. Having an inline extent
+-                               * at such an offset results in chaos as btrfs
+-                               * isn't prepared for such cases. Just skip
+-                               * this case for the same reasons as commented
+-                               * at btrfs_ioctl_clone().
+-                               */
+-                              if (last_dest_end > 0) {
+-                                      ret = -EOPNOTSUPP;
+-                                      btrfs_end_transaction(trans, root);
+-                                      goto out;
+-                              }
+                               if (off > key.offset) {
+                                       skip = off - key.offset;
+@@ -3626,42 +3755,22 @@ process_slot:
+                               size -= skip + trim;
+                               datal -= skip + trim;
+-                              aligned_end = ALIGN(new_key.offset + datal,
+-                                                  root->sectorsize);
+-                              ret = btrfs_drop_extents(trans, root, inode,
+-                                                       drop_start,
+-                                                       aligned_end,
+-                                                       1);
++                              ret = clone_copy_inline_extent(src, inode,
++                                                             trans, path,
++                                                             &new_key,
++                                                             drop_start,
++                                                             datal,
++                                                             skip, size, buf);
+                               if (ret) {
+                                       if (ret != -EOPNOTSUPP)
+                                               btrfs_abort_transaction(trans,
+-                                                      root, ret);
+-                                      btrfs_end_transaction(trans, root);
+-                                      goto out;
+-                              }
+-
+-                              ret = btrfs_insert_empty_item(trans, root, path,
+-                                                            &new_key, size);
+-                              if (ret) {
+-                                      btrfs_abort_transaction(trans, root,
+-                                                              ret);
++                                                                      root,
++                                                                      ret);
+                                       btrfs_end_transaction(trans, root);
+                                       goto out;
+                               }
+-
+-                              if (skip) {
+-                                      u32 start =
+-                                        btrfs_file_extent_calc_inline_size(0);
+-                                      memmove(buf+start, buf+start+skip,
+-                                              datal);
+-                              }
+-
+                               leaf = path->nodes[0];
+                               slot = path->slots[0];
+-                              write_extent_buffer(leaf, buf,
+-                                          btrfs_item_ptr_offset(leaf, slot),
+-                                          size);
+-                              inode_add_bytes(inode, datal);
+                       }
+                       /* If we have an implicit hole (NO_HOLES feature). */
diff --git a/queue-4.3/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch b/queue-4.3/btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch
new file mode 100644 (file)
index 0000000..b613df9
--- /dev/null
@@ -0,0 +1,122 @@
+From 1d512cb77bdbda80f0dd0620a3b260d697fd581d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 9 Nov 2015 00:33:58 +0000
+Subject: Btrfs: fix race leading to BUG_ON when running delalloc for nodatacow
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 1d512cb77bdbda80f0dd0620a3b260d697fd581d upstream.
+
+If we are using the NO_HOLES feature, we have a tiny time window when
+running delalloc for a nodatacow inode where we can race with a concurrent
+link or xattr add operation leading to a BUG_ON.
+
+This happens because at run_delalloc_nocow() we end up casting a leaf item
+of type BTRFS_INODE_[REF|EXTREF]_KEY or of type BTRFS_XATTR_ITEM_KEY to a
+file extent item (struct btrfs_file_extent_item) and then analyse its
+extent type field, which won't match any of the expected extent types
+(values BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]) and therefore trigger an
+explicit BUG_ON(1).
+
+The following sequence diagram shows how the race happens when running a
+no-cow delalloc range [4K, 8K[ for inode 257 and we have the following
+neighbour leafs:
+
+             Leaf X (has N items)                    Leaf Y
+
+ [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ]  [ (257 EXTENT_DATA 8192), ... ]
+              slot N - 2         slot N - 1              slot 0
+
+ (Note the implicit hole for inode 257 regarding the [0, 8K[ range)
+
+       CPU 1                                         CPU 2
+
+ run_delalloc_nocow()
+   btrfs_lookup_file_extent()
+     --> searches for a key with value
+         (257 EXTENT_DATA 4096) in the
+         fs/subvol tree
+     --> returns us a path with
+         path->nodes[0] == leaf X and
+         path->slots[0] == N
+
+   because path->slots[0] is >=
+   btrfs_header_nritems(leaf X), it
+   calls btrfs_next_leaf()
+
+   btrfs_next_leaf()
+     --> releases the path
+
+                                              hard link added to our inode,
+                                              with key (257 INODE_REF 500)
+                                              added to the end of leaf X,
+                                              so leaf X now has N + 1 keys
+
+     --> searches for the key
+         (257 INODE_REF 256), because
+         it was the last key in leaf X
+         before it released the path,
+         with path->keep_locks set to 1
+
+     --> ends up at leaf X again and
+         it verifies that the key
+         (257 INODE_REF 256) is no longer
+         the last key in the leaf, so it
+         returns with path->nodes[0] ==
+         leaf X and path->slots[0] == N,
+         pointing to the new item with
+         key (257 INODE_REF 500)
+
+   the loop iteration of run_delalloc_nocow()
+   does not break out the loop and continues
+   because the key referenced in the path
+   at path->nodes[0] and path->slots[0] is
+   for inode 257, its type is < BTRFS_EXTENT_DATA_KEY
+   and its offset (500) is less than our delalloc
+   range's end (8192)
+
+   the item pointed by the path, an inode reference item,
+   is (incorrectly) interpreted as a file extent item and
+   we get an invalid extent type, leading to the BUG_ON(1):
+
+   if (extent_type == BTRFS_FILE_EXTENT_REG ||
+      extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+       (...)
+   } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+       (...)
+   } else {
+       BUG_ON(1)
+   }
+
+The same can happen if an xattr is added concurrently and ends up having
+a key with an offset smaller than the delalloc's range end.
+
+So fix this by skipping keys with a type smaller than
+BTRFS_EXTENT_DATA_KEY.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -1294,8 +1294,14 @@ next_slot:
+               num_bytes = 0;
+               btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+-              if (found_key.objectid > ino ||
+-                  found_key.type > BTRFS_EXTENT_DATA_KEY ||
++              if (found_key.objectid > ino)
++                      break;
++              if (WARN_ON_ONCE(found_key.objectid < ino) ||
++                  found_key.type < BTRFS_EXTENT_DATA_KEY) {
++                      path->slots[0]++;
++                      goto next_slot;
++              }
++              if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
+                   found_key.offset > end)
+                       break;
diff --git a/queue-4.3/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch b/queue-4.3/btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch
new file mode 100644 (file)
index 0000000..3211eec
--- /dev/null
@@ -0,0 +1,198 @@
+From aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 6 Nov 2015 13:33:33 +0000
+Subject: Btrfs: fix race leading to incorrect item deletion when dropping extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit aeafbf8486c9e2bd53f5cc3c10c0b7fd7149d69c upstream.
+
+While running a stress test I got the following warning triggered:
+
+  [191627.672810] ------------[ cut here ]------------
+  [191627.673949] WARNING: CPU: 8 PID: 8447 at fs/btrfs/file.c:779 __btrfs_drop_extents+0x391/0xa50 [btrfs]()
+  (...)
+  [191627.701485] Call Trace:
+  [191627.702037]  [<ffffffff8145f077>] dump_stack+0x4f/0x7b
+  [191627.702992]  [<ffffffff81095de5>] ? console_unlock+0x356/0x3a2
+  [191627.704091]  [<ffffffff8104b3b0>] warn_slowpath_common+0xa1/0xbb
+  [191627.705380]  [<ffffffffa0664499>] ? __btrfs_drop_extents+0x391/0xa50 [btrfs]
+  [191627.706637]  [<ffffffff8104b46d>] warn_slowpath_null+0x1a/0x1c
+  [191627.707789]  [<ffffffffa0664499>] __btrfs_drop_extents+0x391/0xa50 [btrfs]
+  [191627.709155]  [<ffffffff8115663c>] ? cache_alloc_debugcheck_after.isra.32+0x171/0x1d0
+  [191627.712444]  [<ffffffff81155007>] ? kmemleak_alloc_recursive.constprop.40+0x16/0x18
+  [191627.714162]  [<ffffffffa06570c9>] insert_reserved_file_extent.constprop.40+0x83/0x24e [btrfs]
+  [191627.715887]  [<ffffffffa065422b>] ? start_transaction+0x3bb/0x610 [btrfs]
+  [191627.717287]  [<ffffffffa065b604>] btrfs_finish_ordered_io+0x273/0x4e2 [btrfs]
+  [191627.728865]  [<ffffffffa065b888>] finish_ordered_fn+0x15/0x17 [btrfs]
+  [191627.730045]  [<ffffffffa067d688>] normal_work_helper+0x14c/0x32c [btrfs]
+  [191627.731256]  [<ffffffffa067d96a>] btrfs_endio_write_helper+0x12/0x14 [btrfs]
+  [191627.732661]  [<ffffffff81061119>] process_one_work+0x24c/0x4ae
+  [191627.733822]  [<ffffffff810615b0>] worker_thread+0x206/0x2c2
+  [191627.734857]  [<ffffffff810613aa>] ? process_scheduled_works+0x2f/0x2f
+  [191627.736052]  [<ffffffff810613aa>] ? process_scheduled_works+0x2f/0x2f
+  [191627.737349]  [<ffffffff810669a6>] kthread+0xef/0xf7
+  [191627.738267]  [<ffffffff810f3b3a>] ? time_hardirqs_on+0x15/0x28
+  [191627.739330]  [<ffffffff810668b7>] ? __kthread_parkme+0xad/0xad
+  [191627.741976]  [<ffffffff81465592>] ret_from_fork+0x42/0x70
+  [191627.743080]  [<ffffffff810668b7>] ? __kthread_parkme+0xad/0xad
+  [191627.744206] ---[ end trace bbfddacb7aaada8d ]---
+
+  $ cat -n fs/btrfs/file.c
+  691  int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+  (...)
+  758                  btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+  759                  if (key.objectid > ino ||
+  760                      key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+  761                          break;
+  762
+  763                  fi = btrfs_item_ptr(leaf, path->slots[0],
+  764                                      struct btrfs_file_extent_item);
+  765                  extent_type = btrfs_file_extent_type(leaf, fi);
+  766
+  767                  if (extent_type == BTRFS_FILE_EXTENT_REG ||
+  768                      extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
+  (...)
+  774                  } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+  (...)
+  778                  } else {
+  779                          WARN_ON(1);
+  780                          extent_end = search_start;
+  781                  }
+  (...)
+
+This happened because the item we were processing did not match a file
+extent item (its key type != BTRFS_EXTENT_DATA_KEY), and even in this
+case we cast the item to a struct btrfs_file_extent_item pointer and
+then find a type field value that does not match any of the expected
+values (BTRFS_FILE_EXTENT_[REG|PREALLOC|INLINE]). This scenario happens
+due to a tiny time window where a race can happen as exemplified below.
+For example, consider the following scenario where we're using the
+NO_HOLES feature and we have the following two neighbour leafs:
+
+               Leaf X (has N items)                    Leaf Y
+
+[ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ]  [ (257 EXTENT_DATA 8192), ... ]
+          slot N - 2         slot N - 1              slot 0
+
+Our inode 257 has an implicit hole in the range [0, 8K[ (implicit rather
+than explicit because NO_HOLES is enabled). Now if our inode has an
+ordered extent for the range [4K, 8K[ that is finishing, the following
+can happen:
+
+          CPU 1                                       CPU 2
+
+  btrfs_finish_ordered_io()
+    insert_reserved_file_extent()
+      __btrfs_drop_extents()
+         Searches for the key
+          (257 EXTENT_DATA 4096) through
+          btrfs_lookup_file_extent()
+
+         Key not found and we get a path where
+         path->nodes[0] == leaf X and
+         path->slots[0] == N
+
+         Because path->slots[0] is >=
+         btrfs_header_nritems(leaf X), we call
+         btrfs_next_leaf()
+
+         btrfs_next_leaf() releases the path
+
+                                                  inserts key
+                                                  (257 INODE_REF 4096)
+                                                  at the end of leaf X,
+                                                  leaf X now has N + 1 keys,
+                                                  and the new key is at
+                                                  slot N
+
+         btrfs_next_leaf() searches for
+         key (257 INODE_REF 256), with
+         path->keep_locks set to 1,
+         because it was the last key it
+         saw in leaf X
+
+           finds it in leaf X again and
+           notices it's no longer the last
+           key of the leaf, so it returns 0
+           with path->nodes[0] == leaf X and
+           path->slots[0] == N (which is now
+           < btrfs_header_nritems(leaf X)),
+           pointing to the new key
+           (257 INODE_REF 4096)
+
+         __btrfs_drop_extents() casts the
+         item at path->nodes[0], slot
+         path->slots[0], to a struct
+         btrfs_file_extent_item - it does
+         not skip keys for the target
+         inode with a type less than
+         BTRFS_EXTENT_DATA_KEY
+         (BTRFS_INODE_REF_KEY < BTRFS_EXTENT_DATA_KEY)
+
+         sees a bogus value for the type
+         field triggering the WARN_ON in
+         the trace shown above, and sets
+         extent_end = search_start (4096)
+
+         does the if-then-else logic to
+         fixup 0 length extent items created
+         by a past bug from hole punching:
+
+           if (extent_end == key.offset &&
+               extent_end >= search_start)
+               goto delete_extent_item;
+
+         that evaluates to true and it ends
+         up deleting the key pointed to by
+         path->slots[0], (257 INODE_REF 4096),
+         from leaf X
+
+The same could happen for example for a xattr that ends up having a key
+with an offset value that matches search_start (very unlikely but not
+impossible).
+
+So fix this by ensuring that keys smaller than BTRFS_EXTENT_DATA_KEY are
+skipped, never cast to struct btrfs_file_extent_item and never deleted
+by accident. Also protect against the unexpected case of getting a key
+for a lower inode number by skipping that key and issuing a warning.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/file.c |   16 ++++++++++++----
+ 1 file changed, 12 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -756,8 +756,16 @@ next_slot:
+               }
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+-              if (key.objectid > ino ||
+-                  key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
++
++              if (key.objectid > ino)
++                      break;
++              if (WARN_ON_ONCE(key.objectid < ino) ||
++                  key.type < BTRFS_EXTENT_DATA_KEY) {
++                      ASSERT(del_nr == 0);
++                      path->slots[0]++;
++                      goto next_slot;
++              }
++              if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+                       break;
+               fi = btrfs_item_ptr(leaf, path->slots[0],
+@@ -776,8 +784,8 @@ next_slot:
+                               btrfs_file_extent_inline_len(leaf,
+                                                    path->slots[0], fi);
+               } else {
+-                      WARN_ON(1);
+-                      extent_end = search_start;
++                      /* can't happen */
++                      BUG();
+               }
+               /*
diff --git a/queue-4.3/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch b/queue-4.3/btrfs-fix-race-when-listing-an-inode-s-xattrs.patch
new file mode 100644 (file)
index 0000000..0a13530
--- /dev/null
@@ -0,0 +1,92 @@
+From f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 9 Nov 2015 18:06:38 +0000
+Subject: Btrfs: fix race when listing an inode's xattrs
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit f1cd1f0b7d1b5d4aaa5711e8f4e4898b0045cb6d upstream.
+
+When listing an inode's xattrs we have a time window where we race against
+a concurrent operation for adding a new hard link for our inode that makes
+us not return any xattr to user space. In order for this to happen, the
+first xattr of our inode needs to be at slot 0 of a leaf and the previous
+leaf must still have room for an inode ref (or extref) item, and this can
+happen because an inode's listxattrs callback does not lock the inode's
+i_mutex (nor does the VFS do it for us), but adding a hard link to an
+inode makes the VFS lock the inode's i_mutex before calling the inode's
+link callback.
+
+If we have the following leafs:
+
+               Leaf X (has N items)                    Leaf Y
+
+ [ ... (257 INODE_ITEM 0) (257 INODE_REF 256) ]  [ (257 XATTR_ITEM 12345), ... ]
+           slot N - 2         slot N - 1              slot 0
+
+The race illustrated by the following sequence diagram is possible:
+
+       CPU 1                                               CPU 2
+
+  btrfs_listxattr()
+
+    searches for key (257 XATTR_ITEM 0)
+
+    gets path with path->nodes[0] == leaf X
+    and path->slots[0] == N
+
+    because path->slots[0] is >=
+    btrfs_header_nritems(leaf X), it calls
+    btrfs_next_leaf()
+
+    btrfs_next_leaf()
+      releases the path
+
+                                                   adds key (257 INODE_REF 666)
+                                                   to the end of leaf X (slot N),
+                                                   and leaf X now has N + 1 items
+
+      searches for the key (257 INODE_REF 256),
+      with path->keep_locks == 1, because that
+      is the last key it saw in leaf X before
+      releasing the path
+
+      ends up at leaf X again and it verifies
+      that the key (257 INODE_REF 256) is no
+      longer the last key in leaf X, so it
+      returns with path->nodes[0] == leaf X
+      and path->slots[0] == N, pointing to
+      the new item with key (257 INODE_REF 666)
+
+    btrfs_listxattr's loop iteration sees that
+    the type of the key pointed by the path is
+    different from the type BTRFS_XATTR_ITEM_KEY
+    and so it breaks the loop and stops looking
+    for more xattr items
+      --> the application doesn't get any xattr
+          listed for our inode
+
+So fix this by breaking the loop only if the key's type is greater than
+BTRFS_XATTR_ITEM_KEY and skip the current key if its type is smaller.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/xattr.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/xattr.c
++++ b/fs/btrfs/xattr.c
+@@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *d
+               /* check to make sure this item is what we want */
+               if (found_key.objectid != key.objectid)
+                       break;
+-              if (found_key.type != BTRFS_XATTR_ITEM_KEY)
++              if (found_key.type > BTRFS_XATTR_ITEM_KEY)
+                       break;
++              if (found_key.type < BTRFS_XATTR_ITEM_KEY)
++                      goto next;
+               di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
+               if (verify_dir_item(root, leaf, di))
diff --git a/queue-4.3/btrfs-fix-regression-when-running-delayed-references.patch b/queue-4.3/btrfs-fix-regression-when-running-delayed-references.patch
new file mode 100644 (file)
index 0000000..a767b5c
--- /dev/null
@@ -0,0 +1,307 @@
+From 2c3cf7d5f6105bb957df125dfce61d4483b8742d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 22 Oct 2015 09:47:34 +0100
+Subject: Btrfs: fix regression when running delayed references
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 2c3cf7d5f6105bb957df125dfce61d4483b8742d upstream.
+
+In the kernel 4.2 merge window we had a refactoring/rework of the delayed
+references implementation in order to fix certain problems with qgroups.
+However that rework introduced one more regression that leads to the
+following trace when running delayed references for metadata:
+
+[35908.064664] kernel BUG at fs/btrfs/extent-tree.c:1832!
+[35908.065201] invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
+[35908.065201] Modules linked in: dm_flakey dm_mod btrfs crc32c_generic xor raid6_pq nfsd auth_rpcgss oid_registry nfs_acl nfs lockd grace fscache sunrpc loop fuse parport_pc psmouse i2
+[35908.065201] CPU: 14 PID: 15014 Comm: kworker/u32:9 Tainted: G        W       4.3.0-rc5-btrfs-next-17+ #1
+[35908.065201] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.1-0-g4adadbd-20150316_085822-nilsson.home.kraxel.org 04/01/2014
+[35908.065201] Workqueue: btrfs-extent-refs btrfs_extent_refs_helper [btrfs]
+[35908.065201] task: ffff880114b7d780 ti: ffff88010c4c8000 task.ti: ffff88010c4c8000
+[35908.065201] RIP: 0010:[<ffffffffa04928b5>]  [<ffffffffa04928b5>] insert_inline_extent_backref+0x52/0xb1 [btrfs]
+[35908.065201] RSP: 0018:ffff88010c4cbb08  EFLAGS: 00010293
+[35908.065201] RAX: 0000000000000000 RBX: ffff88008a661000 RCX: 0000000000000000
+[35908.065201] RDX: ffffffffa04dd58f RSI: 0000000000000001 RDI: 0000000000000000
+[35908.065201] RBP: ffff88010c4cbb40 R08: 0000000000001000 R09: ffff88010c4cb9f8
+[35908.065201] R10: 0000000000000000 R11: 000000000000002c R12: 0000000000000000
+[35908.065201] R13: ffff88020a74c578 R14: 0000000000000000 R15: 0000000000000000
+[35908.065201] FS:  0000000000000000(0000) GS:ffff88023edc0000(0000) knlGS:0000000000000000
+[35908.065201] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
+[35908.065201] CR2: 00000000015e8708 CR3: 0000000102185000 CR4: 00000000000006e0
+[35908.065201] Stack:
+[35908.065201]  ffff88010c4cbb18 0000000000000f37 ffff88020a74c578 ffff88015a408000
+[35908.065201]  ffff880154a44000 0000000000000000 0000000000000005 ffff88010c4cbbd8
+[35908.065201]  ffffffffa0492b9a 0000000000000005 0000000000000000 0000000000000000
+[35908.065201] Call Trace:
+[35908.065201]  [<ffffffffa0492b9a>] __btrfs_inc_extent_ref+0x8b/0x208 [btrfs]
+[35908.065201]  [<ffffffffa0497117>] ? __btrfs_run_delayed_refs+0x4d4/0xd33 [btrfs]
+[35908.065201]  [<ffffffffa049773d>] __btrfs_run_delayed_refs+0xafa/0xd33 [btrfs]
+[35908.065201]  [<ffffffffa04a976a>] ? join_transaction.isra.10+0x25/0x41f [btrfs]
+[35908.065201]  [<ffffffffa04a97ed>] ? join_transaction.isra.10+0xa8/0x41f [btrfs]
+[35908.065201]  [<ffffffffa049914d>] btrfs_run_delayed_refs+0x75/0x1dd [btrfs]
+[35908.065201]  [<ffffffffa04992f1>] delayed_ref_async_start+0x3c/0x7b [btrfs]
+[35908.065201]  [<ffffffffa04d4b4f>] normal_work_helper+0x14c/0x32a [btrfs]
+[35908.065201]  [<ffffffffa04d4e93>] btrfs_extent_refs_helper+0x12/0x14 [btrfs]
+[35908.065201]  [<ffffffff81063b23>] process_one_work+0x24a/0x4ac
+[35908.065201]  [<ffffffff81064285>] worker_thread+0x206/0x2c2
+[35908.065201]  [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
+[35908.065201]  [<ffffffff8106407f>] ? rescuer_thread+0x2cb/0x2cb
+[35908.065201]  [<ffffffff8106904d>] kthread+0xef/0xf7
+[35908.065201]  [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
+[35908.065201]  [<ffffffff8147d10f>] ret_from_fork+0x3f/0x70
+[35908.065201]  [<ffffffff81068f5e>] ? kthread_parkme+0x24/0x24
+[35908.065201] Code: 6a 01 41 56 41 54 ff 75 10 41 51 4d 89 c1 49 89 c8 48 8d 4d d0 e8 f6 f1 ff ff 48 83 c4 28 85 c0 75 2c 49 81 fc ff 00 00 00 77 02 <0f> 0b 4c 8b 45 30 8b 4d 28 45 31
+[35908.065201] RIP  [<ffffffffa04928b5>] insert_inline_extent_backref+0x52/0xb1 [btrfs]
+[35908.065201]  RSP <ffff88010c4cbb08>
+[35908.310885] ---[ end trace fe4299baf0666457 ]---
+
+This happens because the new delayed references code no longer merges
+delayed references that have different sequence values. The following
+steps are an example sequence leading to this issue:
+
+1) Transaction N starts, fs_info->tree_mod_seq has value 0;
+
+2) Extent buffer (btree node) A is allocated, delayed reference Ref1 for
+   bytenr A is created, with a value of 1 and a seq value of 0;
+
+3) fs_info->tree_mod_seq is incremented to 1;
+
+4) Extent buffer A is deleted through btrfs_del_items(), which calls
+   btrfs_del_leaf(), which in turn calls btrfs_free_tree_block(). The
+   latter returns the metadata extent associated to extent buffer A to
+   the free space cache (the range is not pinned), because the extent
+   buffer was created in the current transaction (N) and writeback never
+   happened for the extent buffer (flag BTRFS_HEADER_FLAG_WRITTEN not set
+   in the extent buffer).
+   This creates the delayed reference Ref2 for bytenr A, with a value
+   of -1 and a seq value of 1;
+
+5) Delayed reference Ref2 is not merged with Ref1 when we create it,
+   because they have different sequence numbers (decided at
+   add_delayed_ref_tail_merge());
+
+6) fs_info->tree_mod_seq is incremented to 2;
+
+7) Some task attempts to allocate a new extent buffer (done at
+   extent-tree.c:find_free_extent()), but due to heavy fragmentation
+   and running low on metadata space the clustered allocation fails
+   and we fall back to unclustered allocation, which finds the
+   extent at offset A, so a new extent buffer at offset A is allocated.
+   This creates delayed reference Ref3 for bytenr A, with a value of 1
+   and a seq value of 2;
+
+8) Ref3 is merged with neither Ref2 nor Ref1, again because they
+   all have different seq values;
+
+9) We start running the delayed references (__btrfs_run_delayed_refs());
+
+10) The delayed Ref1 is the first one being applied, which ends up
+    creating an inline extent backref in the extent tree;
+
+11) Next the delayed reference Ref3 is selected for execution, and not
+    Ref2, because select_delayed_ref() always gives a preference for
+    positive references (that have an action of BTRFS_ADD_DELAYED_REF);
+
+12) When running Ref3 we already encounter the inline extent backref
+    in the extent tree at insert_inline_extent_backref(), which makes
+    us hit the following BUG_ON:
+
+        BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+
+    This is always true because owner corresponds to the level of the
+    extent buffer/btree node in the btree.
+
+For the scenario described above we hit the BUG_ON because we never merge
+references that have different seq values.
+
+We used to do the merging before the 4.2 kernel, more specifically, before
+the commits:
+
+  c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.")
+  c43d160fcd5e ("btrfs: delayed-ref: Cleanup the unneeded functions.")
+
+This issue became more exposed after the following change that was added
+to 4.2 as well:
+
+  cffc3374e567 ("Btrfs: fix order by which delayed references are run")
+
+Which in turn fixed another regression by the two commits previously
+mentioned.
+
+So fix this by bringing back the delayed reference merge code, with the
+proper adaptations so that it operates against the new data structure
+(linked list vs old red black tree implementation).
+
+This issue was hit running fstest btrfs/063 in a loop. Several people have
+reported this issue in the mailing list when running on kernels 4.2+.
+
+Very special thanks to Stéphane Lesimple for helping debugging this issue
+and testing this fix on his multi terabyte filesystem (which took more
+than one day to balance alone, plus fsck, etc).
+
+Fixes: c6fc24549960 ("btrfs: delayed-ref: Use list to replace the ref_root in ref_head.")
+Reported-by: Peter Becker <floyd.net@gmail.com>
+Reported-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
+Tested-by: Stéphane Lesimple <stephane_btrfs@lesimple.fr>
+Reported-by: Malte Schröder <malte@tnxip.de>
+Reported-by: Derek Dongray <derek@valedon.co.uk>
+Reported-by: Erkki Seppala <flux-btrfs@inside.org>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/delayed-ref.c |  113 +++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/btrfs/extent-tree.c |   14 ++++++
+ 2 files changed, 127 insertions(+)
+
+--- a/fs/btrfs/delayed-ref.c
++++ b/fs/btrfs/delayed-ref.c
+@@ -197,6 +197,119 @@ static inline void drop_delayed_ref(stru
+               trans->delayed_ref_updates--;
+ }
++static bool merge_ref(struct btrfs_trans_handle *trans,
++                    struct btrfs_delayed_ref_root *delayed_refs,
++                    struct btrfs_delayed_ref_head *head,
++                    struct btrfs_delayed_ref_node *ref,
++                    u64 seq)
++{
++      struct btrfs_delayed_ref_node *next;
++      bool done = false;
++
++      next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
++                              list);
++      while (!done && &next->list != &head->ref_list) {
++              int mod;
++              struct btrfs_delayed_ref_node *next2;
++
++              next2 = list_next_entry(next, list);
++
++              if (next == ref)
++                      goto next;
++
++              if (seq && next->seq >= seq)
++                      goto next;
++
++              if (next->type != ref->type || next->no_quota != ref->no_quota)
++                      goto next;
++
++              if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
++                   ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
++                  comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
++                                 btrfs_delayed_node_to_tree_ref(next),
++                                 ref->type))
++                      goto next;
++              if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
++                   ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
++                  comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
++                                 btrfs_delayed_node_to_data_ref(next)))
++                      goto next;
++
++              if (ref->action == next->action) {
++                      mod = next->ref_mod;
++              } else {
++                      if (ref->ref_mod < next->ref_mod) {
++                              swap(ref, next);
++                              done = true;
++                      }
++                      mod = -next->ref_mod;
++              }
++
++              drop_delayed_ref(trans, delayed_refs, head, next);
++              ref->ref_mod += mod;
++              if (ref->ref_mod == 0) {
++                      drop_delayed_ref(trans, delayed_refs, head, ref);
++                      done = true;
++              } else {
++                      /*
++                       * Can't have multiples of the same ref on a tree block.
++                       */
++                      WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
++                              ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
++              }
++next:
++              next = next2;
++      }
++
++      return done;
++}
++
++void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
++                            struct btrfs_fs_info *fs_info,
++                            struct btrfs_delayed_ref_root *delayed_refs,
++                            struct btrfs_delayed_ref_head *head)
++{
++      struct btrfs_delayed_ref_node *ref;
++      u64 seq = 0;
++
++      assert_spin_locked(&head->lock);
++
++      if (list_empty(&head->ref_list))
++              return;
++
++      /* We don't have too many refs to merge for data. */
++      if (head->is_data)
++              return;
++
++      spin_lock(&fs_info->tree_mod_seq_lock);
++      if (!list_empty(&fs_info->tree_mod_seq_list)) {
++              struct seq_list *elem;
++
++              elem = list_first_entry(&fs_info->tree_mod_seq_list,
++                                      struct seq_list, list);
++              seq = elem->seq;
++      }
++      spin_unlock(&fs_info->tree_mod_seq_lock);
++
++      ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
++                             list);
++      while (&ref->list != &head->ref_list) {
++              if (seq && ref->seq >= seq)
++                      goto next;
++
++              if (merge_ref(trans, delayed_refs, head, ref, seq)) {
++                      if (list_empty(&head->ref_list))
++                              break;
++                      ref = list_first_entry(&head->ref_list,
++                                             struct btrfs_delayed_ref_node,
++                                             list);
++                      continue;
++              }
++next:
++              ref = list_next_entry(ref, list);
++      }
++}
++
+ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+                           struct btrfs_delayed_ref_root *delayed_refs,
+                           u64 seq)
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -2433,7 +2433,21 @@ static noinline int __btrfs_run_delayed_
+                       }
+               }
++              /*
++               * We need to try and merge add/drops of the same ref since we
++               * can run into issues with relocate dropping the implicit ref
++               * and then it being added back again before the drop can
++               * finish.  If we merged anything we need to re-loop so we can
++               * get a good ref.
++               * Or we can get node references of the same type that weren't
++               * merged when created due to bumps in the tree mod seq, and
++               * we need to merge them to prevent adding an inline extent
++               * backref before dropping it (triggering a BUG_ON at
++               * insert_inline_extent_backref()).
++               */
+               spin_lock(&locked_ref->lock);
++              btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
++                                       locked_ref);
+               /*
+                * locked_ref is the head node, so we have to go one
diff --git a/queue-4.3/btrfs-fix-resending-received-snapshot-with-parent.patch b/queue-4.3/btrfs-fix-resending-received-snapshot-with-parent.patch
new file mode 100644 (file)
index 0000000..cc96d52
--- /dev/null
@@ -0,0 +1,76 @@
+From b96b1db039ebc584d03a9933b279e0d3e704c528 Mon Sep 17 00:00:00 2001
+From: Robin Ruede <r.ruede@gmail.com>
+Date: Wed, 30 Sep 2015 21:23:33 +0200
+Subject: btrfs: fix resending received snapshot with parent
+
+From: Robin Ruede <r.ruede@gmail.com>
+
+commit b96b1db039ebc584d03a9933b279e0d3e704c528 upstream.
+
+This fixes a regression introduced by 37b8d27d between v4.1 and v4.2.
+
+When a snapshot is received, its received_uuid is set to the original
+uuid of the subvolume. When that snapshot is then resent to a third
+filesystem, its received_uuid is set to the second uuid
+instead of the original one. The same was true for the parent_uuid.
+This behaviour was partially changed in 37b8d27d, but in that patch
+only the parent_uuid was taken from the real original,
+not the uuid itself, causing the search for the parent to fail in
+the case below.
+
+This happens for example when trying to send a series of linked
+snapshots (e.g. created by snapper) from the backup file system back
+to the original one.
+
+The following commands reproduce the issue in v4.2.1
+(no error in 4.1.6)
+
+    # setup three test file systems
+    for i in 1 2 3; do
+           truncate -s 50M fs$i
+           mkfs.btrfs fs$i
+           mkdir $i
+           mount fs$i $i
+    done
+    echo "content" > 1/testfile
+    btrfs su snapshot -r 1/ 1/snap1
+    echo "changed content" > 1/testfile
+    btrfs su snapshot -r 1/ 1/snap2
+
+    # works fine:
+    btrfs send 1/snap1 | btrfs receive 2/
+    btrfs send -p 1/snap1 1/snap2 | btrfs receive 2/
+
+    # ERROR: could not find parent subvolume
+    btrfs send 2/snap1 | btrfs receive 3/
+    btrfs send -p 2/snap1 2/snap2 | btrfs receive 3/
+
+Signed-off-by: Robin Ruede <rruede+git@gmail.com>
+Fixes: 37b8d27de5d0 ("Btrfs: use received_uuid of parent during send")
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Tested-by: Ed Tomlinson <edt@aei.ca>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/send.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -2353,8 +2353,14 @@ static int send_subvol_begin(struct send
+       }
+       TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
+-      TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+-                      sctx->send_root->root_item.uuid);
++
++      if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
++              TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
++                          sctx->send_root->root_item.received_uuid);
++      else
++              TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
++                          sctx->send_root->root_item.uuid);
++
+       TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
+                   le64_to_cpu(sctx->send_root->root_item.ctransid));
+       if (parent_root) {
diff --git a/queue-4.3/btrfs-fix-signed-overflows-in-btrfs_sync_file.patch b/queue-4.3/btrfs-fix-signed-overflows-in-btrfs_sync_file.patch
new file mode 100644 (file)
index 0000000..b4eb4f5
--- /dev/null
@@ -0,0 +1,66 @@
+From 9dcbeed4d7e11e1dcf5e55475de3754f0855d1c2 Mon Sep 17 00:00:00 2001
+From: David Sterba <dsterba@suse.com>
+Date: Mon, 9 Nov 2015 11:44:45 +0100
+Subject: btrfs: fix signed overflows in btrfs_sync_file
+
+From: David Sterba <dsterba@suse.com>
+
+commit 9dcbeed4d7e11e1dcf5e55475de3754f0855d1c2 upstream.
+
+The calculation of range length in btrfs_sync_file leads to signed
+overflow. This was caught by PaX gcc SIZE_OVERFLOW plugin.
+
+https://forums.grsecurity.net/viewtopic.php?f=1&t=4284
+
+The fsync call passes 0 and LLONG_MAX, the range length does not fit to
+loff_t and overflows, but the value is converted to u64 so it silently
+works as expected.
+
+The minimal fix is a typecast to u64, switching functions to take
+(start, end) instead of (start, len) would be more intrusive.
+
+Coccinelle script found that there's one more opencoded calculation of
+the length.
+
+<smpl>
+@@
+loff_t start, end;
+@@
+* end - start
+</smpl>
+
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Chris Mason <clm@fb.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/file.c |   10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1876,8 +1876,13 @@ int btrfs_sync_file(struct file *file, l
+       struct btrfs_log_ctx ctx;
+       int ret = 0;
+       bool full_sync = 0;
+-      const u64 len = end - start + 1;
++      u64 len;
++      /*
++       * The range length can be represented by u64, we have to do the typecasts
++       * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
++       */
++      len = (u64)end - (u64)start + 1;
+       trace_btrfs_sync_file(file, datasync);
+       /*
+@@ -2065,8 +2070,7 @@ int btrfs_sync_file(struct file *file, l
+                       }
+               }
+               if (!full_sync) {
+-                      ret = btrfs_wait_ordered_range(inode, start,
+-                                                     end - start + 1);
++                      ret = btrfs_wait_ordered_range(inode, start, len);
+                       if (ret) {
+                               btrfs_end_transaction(trans, root);
+                               goto out;
diff --git a/queue-4.3/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch b/queue-4.3/btrfs-fix-truncation-of-compressed-and-inlined-extents.patch
new file mode 100644 (file)
index 0000000..a1720ff
--- /dev/null
@@ -0,0 +1,288 @@
+From 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 16 Oct 2015 12:34:25 +0100
+Subject: Btrfs: fix truncation of compressed and inlined extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 0305cd5f7fca85dae392b9ba85b116896eb7c1c7 upstream.
+
+When truncating a file to a smaller size which consists of an inline
+extent that is compressed, we did not discard (or make unusable) the
+data between the new file size and the old file size, wasting metadata
+space and allowing for the truncated data to be leaked and the data
+corruption/loss mentioned below.
+We were also not correctly decrementing the number of bytes used by the
+inode, we were setting it to zero, giving a wrong report for callers of
+the stat(2) syscall. The fsck tool also reported an error about a mismatch
+between the nbytes of the file versus the real space used by the file.
+
+Now because we weren't discarding the truncated region of the file, it
+was possible for a caller of the clone ioctl to actually read the data
+that was truncated, allowing for a security breach without requiring root
+access to the system, using only standard filesystem operations. The
+scenario is the following:
+
+   1) User A creates a file which consists of an inline and compressed
+      extent with a size of 2000 bytes - the file is not accessible to
+      any other users (no read, write or execution permission for anyone
+      else);
+
+   2) The user truncates the file to a size of 1000 bytes;
+
+   3) User A makes the file world readable;
+
+   4) User B creates a file consisting of an inline extent of 2000 bytes;
+
+   5) User B issues a clone operation from user A's file into its own
+      file (using a length argument of 0, clone the whole range);
+
+   6) User B now gets to see the 1000 bytes that user A truncated from
+      its file before it made its file world readable. User B also lost
+      the bytes in the range [1000, 2000[ bytes from its own file, but
+      that might be ok if his/her intention was reading stale data from
+      user A that was never supposed to be public.
+
+Note that this contrasts with the case where we truncate a file from 2000
+bytes to 1000 bytes and then truncate it back from 1000 to 2000 bytes. In
+this case reading any byte from the range [1000, 2000[ will return a value
+of 0x00, instead of the original data.
+
+This problem exists since the clone ioctl was added and happens both with
+and without my recent data loss and file corruption fixes for the clone
+ioctl (patch "Btrfs: fix file corruption and data loss after cloning
+inline extents").
+
+So fix this by truncating the compressed inline extents as we do for the
+non-compressed case, which involves decompressing, if the data isn't already
+in the page cache, compressing the truncated version of the extent, writing
+the compressed content into the inline extent and then truncate it.
+
+The following test case for fstests reproduces the problem. In order for
+the test to pass both this fix and my previous fix for the clone ioctl
+that forbids cloning a smaller inline extent into a larger one,
+which is titled "Btrfs: fix file corruption and data loss after cloning
+inline extents", are needed. Without that other fix the test fails in a
+different way that does not leak the truncated data, instead part of
+destination file gets replaced with zeroes (because the destination file
+has a larger inline extent than the source).
+
+  seq=`basename $0`
+  seqres=$RESULT_DIR/$seq
+  echo "QA output created by $seq"
+  tmp=/tmp/$$
+  status=1     # failure is the default!
+  trap "_cleanup; exit \$status" 0 1 2 3 15
+
+  _cleanup()
+  {
+      rm -f $tmp.*
+  }
+
+  # get standard environment, filters and checks
+  . ./common/rc
+  . ./common/filter
+
+  # real QA test starts here
+  _need_to_be_root
+  _supported_fs btrfs
+  _supported_os Linux
+  _require_scratch
+  _require_cloner
+
+  rm -f $seqres.full
+
+  _scratch_mkfs >>$seqres.full 2>&1
+  _scratch_mount "-o compress"
+
+  # Create our test files. File foo is going to be the source of a clone operation
+  # and consists of a single inline extent with an uncompressed size of 512 bytes,
+  # while file bar consists of a single inline extent with an uncompressed size of
+  # 256 bytes. For our test's purpose, it's important that file bar has an inline
+  # extent with a size smaller than foo's inline extent.
+  $XFS_IO_PROG -f -c "pwrite -S 0xa1 0 128"   \
+          -c "pwrite -S 0x2a 128 384" \
+          $SCRATCH_MNT/foo | _filter_xfs_io
+  $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 256" $SCRATCH_MNT/bar | _filter_xfs_io
+
+  # Now durably persist all metadata and data. We do this to make sure that we get
+  # on disk an inline extent with a size of 512 bytes for file foo.
+  sync
+
+  # Now truncate our file foo to a smaller size. Because it consists of a
+  # compressed and inline extent, btrfs did not shrink the inline extent to the
+  # new size (if the extent was not compressed, btrfs would shrink it to 128
+  # bytes), it only updates the inode's i_size to 128 bytes.
+  $XFS_IO_PROG -c "truncate 128" $SCRATCH_MNT/foo
+
+  # Now clone foo's inline extent into bar.
+  # This clone operation should fail with errno EOPNOTSUPP because the source
+  # file consists only of an inline extent and the file's size is smaller than
+  # the inline extent of the destination (128 bytes < 256 bytes). However the
+  # clone ioctl was not prepared to deal with a file that has a size smaller
+  # than the size of its inline extent (something that happens only for compressed
+  # inline extents), resulting in copying the full inline extent from the source
+  # file into the destination file.
+  #
+  # Note that btrfs' clone operation for inline extents consists of removing the
+  # inline extent from the destination inode and copy the inline extent from the
+  # source inode into the destination inode, meaning that if the destination
+  # inode's inline extent is larger (N bytes) than the source inode's inline
+  # extent (M bytes), some bytes (N - M bytes) will be lost from the destination
+  # file. Btrfs could copy the source inline extent's data into the destination's
+  # inline extent so that we would not lose any data, but that's currently not
+  # done due to the complexity that would be needed to deal with such cases
+  # (especially when one or both extents are compressed), returning EOPNOTSUPP, as
+  # it's normally not a very common case to clone very small files (only case
+  # where we get inline extents) and copying inline extents does not save any
+  # space (unlike for normal, non-inlined extents).
+  $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/foo $SCRATCH_MNT/bar
+
+  # Now because the above clone operation used to succeed, and due to foo's inline
+  # extent not being shrunk by the truncate operation, our file bar got the whole
+  # inline extent copied from foo, making us lose the last 128 bytes from bar
+  # which got replaced by the bytes in range [128, 256[ from foo before foo was
+  # truncated - in other words, data loss from bar and being able to read old and
+  # stale data from foo that should not be possible to read anymore through normal
+  # filesystem operations. Contrast with the case where we truncate a file from a
+  # size N to a smaller size M, truncate it back to size N and then read the range
+  # [M, N[, we should always get the value 0x00 for all the bytes in that range.
+
+  # We expected the clone operation to fail with errno EOPNOTSUPP and therefore
+  # not modify our file's bar data/metadata. So its content should be 256 bytes
+  # long with all bytes having the value 0xbb.
+  #
+  # Without the btrfs bug fix, the clone operation succeeded and resulted in
+  # leaking truncated data from foo, the bytes that belonged to its range
+  # [128, 256[, and losing data from bar in that same range. So reading the
+  # file gave us the following content:
+  #
+  # 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
+  # *
+  # 0000200 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a 2a
+  # *
+  # 0000400
+  echo "File bar's content after the clone operation:"
+  od -t x1 $SCRATCH_MNT/bar
+
+  # Also because the foo's inline extent was not shrunk by the truncate
+  # operation, btrfs' fsck, which is run by the fstests framework every time a
+  # test completes, failed reporting the following error:
+  #
+  #  root 5 inode 257 errors 400, nbytes wrong
+
+  status=0
+  exit
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |   82 +++++++++++++++++++++++++++++++++++++++++++++----------
+ 1 file changed, 68 insertions(+), 14 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4217,6 +4217,47 @@ static int truncate_space_check(struct b
+ }
++static int truncate_inline_extent(struct inode *inode,
++                                struct btrfs_path *path,
++                                struct btrfs_key *found_key,
++                                const u64 item_end,
++                                const u64 new_size)
++{
++      struct extent_buffer *leaf = path->nodes[0];
++      int slot = path->slots[0];
++      struct btrfs_file_extent_item *fi;
++      u32 size = (u32)(new_size - found_key->offset);
++      struct btrfs_root *root = BTRFS_I(inode)->root;
++
++      fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
++
++      if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
++              loff_t offset = new_size;
++              loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
++
++              /*
++               * Zero out the remaining of the last page of our inline extent,
++               * instead of directly truncating our inline extent here - that
++               * would be much more complex (decompressing all the data, then
++               * compressing the truncated data, which might be bigger than
++               * the size of the inline extent, resize the extent, etc).
++               * We release the path because to get the page we might need to
++               * read the extent item from disk (data not in the page cache).
++               */
++              btrfs_release_path(path);
++              return btrfs_truncate_page(inode, offset, page_end - offset, 0);
++      }
++
++      btrfs_set_file_extent_ram_bytes(leaf, fi, size);
++      size = btrfs_file_extent_calc_inline_size(size);
++      btrfs_truncate_item(root, path, size, 1);
++
++      if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
++              inode_sub_bytes(inode, item_end + 1 - new_size);
++
++      return 0;
++}
++
+ /*
+  * this can truncate away extent items, csum items and directory items.
+  * It starts at a high offset and removes keys until it can't find
+@@ -4411,27 +4452,40 @@ search_again:
+                        * special encodings
+                        */
+                       if (!del_item &&
+-                          btrfs_file_extent_compression(leaf, fi) == 0 &&
+                           btrfs_file_extent_encryption(leaf, fi) == 0 &&
+                           btrfs_file_extent_other_encoding(leaf, fi) == 0) {
+-                              u32 size = new_size - found_key.offset;
+-
+-                              if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+-                                      inode_sub_bytes(inode, item_end + 1 -
+-                                                      new_size);
+                               /*
+-                               * update the ram bytes to properly reflect
+-                               * the new size of our item
++                               * Need to release path in order to truncate a
++                               * compressed extent. So delete any accumulated
++                               * extent items so far.
+                                */
+-                              btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+-                              size =
+-                                  btrfs_file_extent_calc_inline_size(size);
+-                              btrfs_truncate_item(root, path, size, 1);
++                              if (btrfs_file_extent_compression(leaf, fi) !=
++                                  BTRFS_COMPRESS_NONE && pending_del_nr) {
++                                      err = btrfs_del_items(trans, root, path,
++                                                            pending_del_slot,
++                                                            pending_del_nr);
++                                      if (err) {
++                                              btrfs_abort_transaction(trans,
++                                                                      root,
++                                                                      err);
++                                              goto error;
++                                      }
++                                      pending_del_nr = 0;
++                              }
++
++                              err = truncate_inline_extent(inode, path,
++                                                           &found_key,
++                                                           item_end,
++                                                           new_size);
++                              if (err) {
++                                      btrfs_abort_transaction(trans,
++                                                              root, err);
++                                      goto error;
++                              }
+                       } else if (test_bit(BTRFS_ROOT_REF_COWS,
+                                           &root->state)) {
+-                              inode_sub_bytes(inode, item_end + 1 -
+-                                              found_key.offset);
++                              inode_sub_bytes(inode, item_end + 1 - new_size);
+                       }
+               }
+ delete:
diff --git a/queue-4.3/ceph-fix-message-length-computation.patch b/queue-4.3/ceph-fix-message-length-computation.patch
new file mode 100644 (file)
index 0000000..920bc62
--- /dev/null
@@ -0,0 +1,37 @@
+From 777d738a5e58ba3b6f3932ab1543ce93703f4873 Mon Sep 17 00:00:00 2001
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Wed, 30 Sep 2015 15:04:42 +0200
+Subject: ceph: fix message length computation
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+commit 777d738a5e58ba3b6f3932ab1543ce93703f4873 upstream.
+
+create_request_message() computes the maximum length of a message,
+but uses the wrong type for the time stamp: sizeof(struct timespec)
+may be 8 or 16 depending on the architecture, while sizeof(struct
+ceph_timespec) is always 8, and that is what gets put into the
+message.
+
+Found while auditing the uses of timespec for y2038 problems.
+
+Fixes: b8e69066d8af ("ceph: include time stamp in every MDS request")
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Yan, Zheng <zyan@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ceph/mds_client.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ceph/mds_client.c
++++ b/fs/ceph/mds_client.c
+@@ -1935,7 +1935,7 @@ static struct ceph_msg *create_request_m
+       len = sizeof(*head) +
+               pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
+-              sizeof(struct timespec);
++              sizeof(struct ceph_timespec);
+       /* calculate (max) length for cap releases */
+       len += sizeof(struct ceph_mds_request_release) *
diff --git a/queue-4.3/cobalt-fix-kconfig-dependency.patch b/queue-4.3/cobalt-fix-kconfig-dependency.patch
new file mode 100644 (file)
index 0000000..66bde08
--- /dev/null
@@ -0,0 +1,55 @@
+From fc88dd16a0e430f57458e6bd9b62a631c6ea53a1 Mon Sep 17 00:00:00 2001
+From: Hans Verkuil <hverkuil@xs4all.nl>
+Date: Mon, 21 Sep 2015 08:42:04 -0300
+Subject: [media] cobalt: fix Kconfig dependency
+
+From: Hans Verkuil <hverkuil@xs4all.nl>
+
+commit fc88dd16a0e430f57458e6bd9b62a631c6ea53a1 upstream.
+
+The cobalt driver should depend on VIDEO_V4L2_SUBDEV_API.
+
+This fixes this kbuild error:
+
+tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master
+head:   99bc7215bc60f6cd414cf1b85cd9d52cc596cccb
+commit: 85756a069c55e0315ac5990806899cfb607b987f [media] cobalt: add new driver
+config: x86_64-randconfig-s0-09201514 (attached as .config)
+reproduce:
+  git checkout 85756a069c55e0315ac5990806899cfb607b987f
+  # save the attached .config to linux build tree
+  make ARCH=x86_64
+
+All error/warnings (new ones prefixed by >>):
+
+   drivers/media/i2c/adv7604.c: In function 'adv76xx_get_format':
+>> drivers/media/i2c/adv7604.c:1853:9: error: implicit declaration of function 'v4l2_subdev_get_try_format' [-Werror=implicit-function-declaration]
+      fmt = v4l2_subdev_get_try_format(sd, cfg, format->pad);
+            ^
+   drivers/media/i2c/adv7604.c:1853:7: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
+      fmt = v4l2_subdev_get_try_format(sd, cfg, format->pad);
+          ^
+   drivers/media/i2c/adv7604.c: In function 'adv76xx_set_format':
+   drivers/media/i2c/adv7604.c:1882:7: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
+      fmt = v4l2_subdev_get_try_format(sd, cfg, format->pad);
+          ^
+   cc1: some warnings being treated as errors
+
+Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
+Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/media/pci/cobalt/Kconfig |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/media/pci/cobalt/Kconfig
++++ b/drivers/media/pci/cobalt/Kconfig
+@@ -1,6 +1,6 @@
+ config VIDEO_COBALT
+       tristate "Cisco Cobalt support"
+-      depends on VIDEO_V4L2 && I2C && MEDIA_CONTROLLER
++      depends on VIDEO_V4L2 && I2C && VIDEO_V4L2_SUBDEV_API
+       depends on PCI_MSI && MTD_COMPLEX_MAPPINGS
+       depends on GPIOLIB || COMPILE_TEST
+       depends on SND
diff --git a/queue-4.3/debugfs-fix-refcount-imbalance-in-start_creating.patch b/queue-4.3/debugfs-fix-refcount-imbalance-in-start_creating.patch
new file mode 100644 (file)
index 0000000..424f7cd
--- /dev/null
@@ -0,0 +1,47 @@
+From 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Thu, 5 Nov 2015 00:01:51 +0100
+Subject: debugfs: fix refcount imbalance in start_creating
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit 0ee9608c89e81a1ccee52ecb58a7ff040e2522d9 upstream.
+
+In debugfs' start_creating(), we pin the file system to safely access
+its root. When we failed to create a file, we unpin the file system via
+failed_creating() to release the mount count and eventually the reference
+of the vfsmount.
+
+However, when we run into an error during lookup_one_len() while still
+in start_creating(), we only release the parent's mutex but not the
+reference on the mount. Looks like it was done in the past, but after
+splitting portions of __create_file() into start_creating() and
+end_creating() via 190afd81e4a5 ("debugfs: split the beginning and the
+end of __create_file() off"), this seems to have been missed. Noticed
+during code review.
+
+Fixes: 190afd81e4a5 ("debugfs: split the beginning and the end of __create_file() off")
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/debugfs/inode.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/debugfs/inode.c
++++ b/fs/debugfs/inode.c
+@@ -271,8 +271,12 @@ static struct dentry *start_creating(con
+               dput(dentry);
+               dentry = ERR_PTR(-EEXIST);
+       }
+-      if (IS_ERR(dentry))
++
++      if (IS_ERR(dentry)) {
+               mutex_unlock(&d_inode(parent)->i_mutex);
++              simple_release_fs(&debugfs_mount, &debugfs_mount_count);
++      }
++
+       return dentry;
+ }
diff --git a/queue-4.3/ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch b/queue-4.3/ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch
new file mode 100644 (file)
index 0000000..09eb740
--- /dev/null
@@ -0,0 +1,90 @@
+From 36086d43f6575c081067de9855786a2fc91df77b Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sat, 3 Oct 2015 10:49:29 -0400
+Subject: ext4 crypto: fix bugs in ext4_encrypted_zeroout()
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 36086d43f6575c081067de9855786a2fc91df77b upstream.
+
+Fix multiple bugs in ext4_encrypted_zeroout(), including one that
+could cause us to write an encrypted zero page to the wrong location
+on disk, potentially causing data and file system corruption.
+Fortunately, this tends to only show up in stress tests, but even with
+these fixes, we are seeing some test failures with generic/127 --- but
+these are now caused by data failures instead of metadata corruption.
+
+Since ext4_encrypted_zeroout() is only used for some optimizations to
+keep the extent tree from being too fragmented, and
+ext4_encrypted_zeroout() itself isn't all that optimized from a time
+or IOPS perspective, disable the extent tree optimization for
+encrypted inodes for now.  This prevents the data corruption issues
+reported by generic/127 until we can figure out what's going wrong.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/crypto.c  |   23 +++++++++++++++++++----
+ fs/ext4/extents.c |    3 +++
+ 2 files changed, 22 insertions(+), 4 deletions(-)
+
+--- a/fs/ext4/crypto.c
++++ b/fs/ext4/crypto.c
+@@ -410,7 +410,13 @@ int ext4_encrypted_zeroout(struct inode
+       ext4_lblk_t             lblk = ex->ee_block;
+       ext4_fsblk_t            pblk = ext4_ext_pblock(ex);
+       unsigned int            len = ext4_ext_get_actual_len(ex);
+-      int                     err = 0;
++      int                     ret, err = 0;
++
++#if 0
++      ext4_msg(inode->i_sb, KERN_CRIT,
++               "ext4_encrypted_zeroout ino %lu lblk %u len %u",
++               (unsigned long) inode->i_ino, lblk, len);
++#endif
+       BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+@@ -436,17 +442,26 @@ int ext4_encrypted_zeroout(struct inode
+                       goto errout;
+               }
+               bio->bi_bdev = inode->i_sb->s_bdev;
+-              bio->bi_iter.bi_sector = pblk;
+-              err = bio_add_page(bio, ciphertext_page,
++              bio->bi_iter.bi_sector =
++                      pblk << (inode->i_sb->s_blocksize_bits - 9);
++              ret = bio_add_page(bio, ciphertext_page,
+                                  inode->i_sb->s_blocksize, 0);
+-              if (err) {
++              if (ret != inode->i_sb->s_blocksize) {
++                      /* should never happen! */
++                      ext4_msg(inode->i_sb, KERN_ERR,
++                               "bio_add_page failed: %d", ret);
++                      WARN_ON(1);
+                       bio_put(bio);
++                      err = -EIO;
+                       goto errout;
+               }
+               err = submit_bio_wait(WRITE, bio);
++              if ((err == 0) && bio->bi_error)
++                      err = -EIO;
+               bio_put(bio);
+               if (err)
+                       goto errout;
++              lblk++; pblk++;
+       }
+       err = 0;
+ errout:
+--- a/fs/ext4/extents.c
++++ b/fs/ext4/extents.c
+@@ -3558,6 +3558,9 @@ static int ext4_ext_convert_to_initializ
+               max_zeroout = sbi->s_extent_max_zeroout_kb >>
+                       (inode->i_sb->s_blocksize_bits - 10);
++      if (ext4_encrypted_inode(inode))
++              max_zeroout = 0;
++
+       /* If extent is less than s_max_zeroout_kb, zeroout directly */
+       if (max_zeroout && (ee_len <= max_zeroout)) {
+               err = ext4_ext_zeroout(inode, ex);
diff --git a/queue-4.3/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch b/queue-4.3/ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch
new file mode 100644 (file)
index 0000000..0253759
--- /dev/null
@@ -0,0 +1,51 @@
+From 937d7b84dca58f2565715f2c8e52f14c3d65fb22 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Fri, 2 Oct 2015 23:54:58 -0400
+Subject: ext4 crypto: fix memory leak in ext4_bio_write_page()
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 937d7b84dca58f2565715f2c8e52f14c3d65fb22 upstream.
+
+There are times when ext4_bio_write_page() is called even though we
+don't actually need to do any I/O.  This happens when ext4_writepage()
+gets called by the jbd2 commit path when an inode needs to force its
+pages written out in order to provide data=ordered guarantees --- and
+a page is backed by an unwritten (e.g., uninitialized) block on disk,
+or if delayed allocation means the page's backing store hasn't been
+allocated yet.  In that case, we need to skip the call to
+ext4_encrypt_page(), since in addition to wasting CPU, it leads to a
+bounce page and an ext4 crypto context getting leaked.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/page-io.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/fs/ext4/page-io.c
++++ b/fs/ext4/page-io.c
+@@ -425,6 +425,7 @@ int ext4_bio_write_page(struct ext4_io_s
+       struct buffer_head *bh, *head;
+       int ret = 0;
+       int nr_submitted = 0;
++      int nr_to_submit = 0;
+       blocksize = 1 << inode->i_blkbits;
+@@ -477,11 +478,13 @@ int ext4_bio_write_page(struct ext4_io_s
+                       unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+               }
+               set_buffer_async_write(bh);
++              nr_to_submit++;
+       } while ((bh = bh->b_this_page) != head);
+       bh = head = page_buffers(page);
+-      if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
++      if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
++          nr_to_submit) {
+               data_page = ext4_encrypt(inode, page);
+               if (IS_ERR(data_page)) {
+                       ret = PTR_ERR(data_page);
diff --git a/queue-4.3/ext4-crypto-replace-some-bug_on-s-with-error-checks.patch b/queue-4.3/ext4-crypto-replace-some-bug_on-s-with-error-checks.patch
new file mode 100644 (file)
index 0000000..2a2d308
--- /dev/null
@@ -0,0 +1,101 @@
+From 687c3c36e754a999a8263745b27965128db4fee5 Mon Sep 17 00:00:00 2001
+From: Theodore Ts'o <tytso@mit.edu>
+Date: Sat, 3 Oct 2015 10:49:27 -0400
+Subject: ext4 crypto: replace some BUG_ON()'s with error checks
+
+From: Theodore Ts'o <tytso@mit.edu>
+
+commit 687c3c36e754a999a8263745b27965128db4fee5 upstream.
+
+Buggy (or hostile) userspace should not be able to cause the kernel to
+crash.
+
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/crypto.c        |    1 -
+ fs/ext4/crypto_fname.c  |    2 --
+ fs/ext4/crypto_key.c    |   16 +++++++++++++---
+ fs/ext4/crypto_policy.c |    3 ++-
+ 4 files changed, 15 insertions(+), 7 deletions(-)
+
+--- a/fs/ext4/crypto.c
++++ b/fs/ext4/crypto.c
+@@ -296,7 +296,6 @@ static int ext4_page_crypto(struct ext4_
+       else
+               res = crypto_ablkcipher_encrypt(req);
+       if (res == -EINPROGRESS || res == -EBUSY) {
+-              BUG_ON(req->base.data != &ecr);
+               wait_for_completion(&ecr.completion);
+               res = ecr.res;
+       }
+--- a/fs/ext4/crypto_fname.c
++++ b/fs/ext4/crypto_fname.c
+@@ -120,7 +120,6 @@ static int ext4_fname_encrypt(struct ino
+       ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+       res = crypto_ablkcipher_encrypt(req);
+       if (res == -EINPROGRESS || res == -EBUSY) {
+-              BUG_ON(req->base.data != &ecr);
+               wait_for_completion(&ecr.completion);
+               res = ecr.res;
+       }
+@@ -182,7 +181,6 @@ static int ext4_fname_decrypt(struct ino
+       ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+       res = crypto_ablkcipher_decrypt(req);
+       if (res == -EINPROGRESS || res == -EBUSY) {
+-              BUG_ON(req->base.data != &ecr);
+               wait_for_completion(&ecr.completion);
+               res = ecr.res;
+       }
+--- a/fs/ext4/crypto_key.c
++++ b/fs/ext4/crypto_key.c
+@@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deri
+                                    EXT4_AES_256_XTS_KEY_SIZE, NULL);
+       res = crypto_ablkcipher_encrypt(req);
+       if (res == -EINPROGRESS || res == -EBUSY) {
+-              BUG_ON(req->base.data != &ecr);
+               wait_for_completion(&ecr.completion);
+               res = ecr.res;
+       }
+@@ -208,7 +207,12 @@ retry:
+               goto out;
+       }
+       crypt_info->ci_keyring_key = keyring_key;
+-      BUG_ON(keyring_key->type != &key_type_logon);
++      if (keyring_key->type != &key_type_logon) {
++              printk_once(KERN_WARNING
++                          "ext4: key type must be logon\n");
++              res = -ENOKEY;
++              goto out;
++      }
+       ukp = ((struct user_key_payload *)keyring_key->payload.data);
+       if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
+               res = -EINVAL;
+@@ -217,7 +221,13 @@ retry:
+       master_key = (struct ext4_encryption_key *)ukp->data;
+       BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
+                    EXT4_KEY_DERIVATION_NONCE_SIZE);
+-      BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
++      if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) {
++              printk_once(KERN_WARNING
++                          "ext4: key size incorrect: %d\n",
++                          master_key->size);
++              res = -ENOKEY;
++              goto out;
++      }
+       res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
+                                 raw_key);
+       if (res)
+--- a/fs/ext4/crypto_policy.c
++++ b/fs/ext4/crypto_policy.c
+@@ -150,7 +150,8 @@ int ext4_is_child_context_consistent_wit
+       if ((parent == NULL) || (child == NULL)) {
+               pr_err("parent %p child %p\n", parent, child);
+-              BUG_ON(1);
++              WARN_ON(1);     /* Should never happen */
++              return 0;
+       }
+       /* no restrictions if the parent directory is not encrypted */
+       if (!ext4_encrypted_inode(parent))
diff --git a/queue-4.3/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch b/queue-4.3/ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch
new file mode 100644 (file)
index 0000000..a427ddc
--- /dev/null
@@ -0,0 +1,44 @@
+From 6934da9238da947628be83635e365df41064b09b Mon Sep 17 00:00:00 2001
+From: Lukas Czerner <lczerner@redhat.com>
+Date: Sat, 17 Oct 2015 22:57:06 -0400
+Subject: ext4: fix potential use after free in __ext4_journal_stop
+
+From: Lukas Czerner <lczerner@redhat.com>
+
+commit 6934da9238da947628be83635e365df41064b09b upstream.
+
+There is a use-after-free possibility in __ext4_journal_stop() in the
+case that we free the handle in the first jbd2_journal_stop() because
+we're referencing handle->h_err afterwards. This was introduced in
+9705acd63b125dee8b15c705216d7186daea4625 and it is wrong. Fix it by
+storing the handle->h_err value beforehand and avoiding any reference to
+the potentially freed handle.
+
+Fixes: 9705acd63b125dee8b15c705216d7186daea4625
+Signed-off-by: Lukas Czerner <lczerner@redhat.com>
+Reviewed-by: Andreas Dilger <adilger@dilger.ca>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/ext4_jbd2.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/ext4_jbd2.c
++++ b/fs/ext4/ext4_jbd2.c
+@@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *wher
+               return 0;
+       }
++      err = handle->h_err;
+       if (!handle->h_transaction) {
+-              err = jbd2_journal_stop(handle);
+-              return handle->h_err ? handle->h_err : err;
++              rc = jbd2_journal_stop(handle);
++              return err ? err : rc;
+       }
+       sb = handle->h_transaction->t_journal->j_private;
+-      err = handle->h_err;
+       rc = jbd2_journal_stop(handle);
+       if (!err)
diff --git a/queue-4.3/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch b/queue-4.3/ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch
new file mode 100644 (file)
index 0000000..d14582e
--- /dev/null
@@ -0,0 +1,104 @@
+From 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 Mon Sep 17 00:00:00 2001
+From: Daeho Jeong <daeho.jeong@samsung.com>
+Date: Sun, 18 Oct 2015 17:02:56 -0400
+Subject: ext4, jbd2: ensure entering into panic after recording an error in superblock
+
+From: Daeho Jeong <daeho.jeong@samsung.com>
+
+commit 4327ba52afd03fc4b5afa0ee1d774c9c5b0e85c5 upstream.
+
+If an EXT4 filesystem uses JBD2 journaling and an error occurs, the
+journal is aborted first, then the error number is recorded in the
+JBD2 superblock and, finally, the system panics if the
+"errors=panic" option is set.  In rare cases, however, this sequence
+is interleaved as shown in the figure below, so that the system
+enters the panic state (which means a system reset in a mobile
+environment) before the error has been recorded in the journal
+superblock. In this case, e2fsck cannot recognize that a filesystem
+failure occurred in the previous run, and the corruption is not
+fixed.
+
+Task A                        Task B
+ext4_handle_error()
+-> jbd2_journal_abort()
+  -> __journal_abort_soft()
+    -> __jbd2_journal_abort_hard()
+    | -> journal->j_flags |= JBD2_ABORT;
+    |
+    |                         __ext4_abort()
+    |                         -> jbd2_journal_abort()
+    |                         | -> __journal_abort_soft()
+    |                         |   -> if (journal->j_flags & JBD2_ABORT)
+    |                         |           return;
+    |                         -> panic()
+    |
+    -> jbd2_journal_update_sb_errno()
+
+Tested-by: Hobin Woo <hobin.woo@samsung.com>
+Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ext4/super.c      |   12 ++++++++++--
+ fs/jbd2/journal.c    |    6 +++++-
+ include/linux/jbd2.h |    1 +
+ 3 files changed, 16 insertions(+), 3 deletions(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -394,9 +394,13 @@ static void ext4_handle_error(struct sup
+               smp_wmb();
+               sb->s_flags |= MS_RDONLY;
+       }
+-      if (test_opt(sb, ERRORS_PANIC))
++      if (test_opt(sb, ERRORS_PANIC)) {
++              if (EXT4_SB(sb)->s_journal &&
++                !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
++                      return;
+               panic("EXT4-fs (device %s): panic forced after error\n",
+                       sb->s_id);
++      }
+ }
+ #define ext4_error_ratelimit(sb)                                      \
+@@ -585,8 +589,12 @@ void __ext4_abort(struct super_block *sb
+                       jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
+               save_error_info(sb, function, line);
+       }
+-      if (test_opt(sb, ERRORS_PANIC))
++      if (test_opt(sb, ERRORS_PANIC)) {
++              if (EXT4_SB(sb)->s_journal &&
++                !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
++                      return;
+               panic("EXT4-fs panic from previous error\n");
++      }
+ }
+ void __ext4_msg(struct super_block *sb,
+--- a/fs/jbd2/journal.c
++++ b/fs/jbd2/journal.c
+@@ -2071,8 +2071,12 @@ static void __journal_abort_soft (journa
+       __jbd2_journal_abort_hard(journal);
+-      if (errno)
++      if (errno) {
+               jbd2_journal_update_sb_errno(journal);
++              write_lock(&journal->j_state_lock);
++              journal->j_flags |= JBD2_REC_ERR;
++              write_unlock(&journal->j_state_lock);
++      }
+ }
+ /**
+--- a/include/linux/jbd2.h
++++ b/include/linux/jbd2.h
+@@ -1046,6 +1046,7 @@ struct journal_s
+ #define JBD2_ABORT_ON_SYNCDATA_ERR    0x040   /* Abort the journal on file
+                                                * data write error in ordered
+                                                * mode */
++#define JBD2_REC_ERR  0x080   /* The errno in the sb has been recorded */
+ /*
+  * Function declarations for the journaling transaction and buffer
diff --git a/queue-4.3/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch b/queue-4.3/firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch
new file mode 100644 (file)
index 0000000..a034d06
--- /dev/null
@@ -0,0 +1,71 @@
+From 100ceb66d5c40cc0c7018e06a9474302470be73c Mon Sep 17 00:00:00 2001
+From: Stefan Richter <stefanr@s5r6.in-berlin.de>
+Date: Tue, 3 Nov 2015 01:46:21 +0100
+Subject: firewire: ohci: fix JMicron JMB38x IT context discovery
+
+From: Stefan Richter <stefanr@s5r6.in-berlin.de>
+
+commit 100ceb66d5c40cc0c7018e06a9474302470be73c upstream.
+
+Reported by Clifford and Craig for JMicron OHCI-1394 + SDHCI combo
+controllers:  Often or even most of the time, the controller is
+initialized with the message "added OHCI v1.10 device as card 0, 4 IR +
+0 IT contexts, quirks 0x10".  With 0 isochronous transmit DMA contexts
+(IT contexts), applications like audio output are impossible.
+
+However, OHCI-1394 demands that at least 4 IT contexts are implemented
+by the link layer controller, and indeed JMicron JMB38x do implement
+four of them.  Only their IsoXmitIntMask register is unreliable at early
+access.
+
+With my own JMB381 single function controller I found:
+  - I can reproduce the problem with a lower probability than Craig's.
+  - If I put a loop around the section which clears and reads
+    IsoXmitIntMask, then either the first or the second attempt will
+    return the correct initial mask of 0x0000000f.  I never encountered
+    a case of needing more than a second attempt.
+  - Consequently, if I put a dummy reg_read(...IsoXmitIntMaskSet)
+    before the first write, the subsequent read will return the correct
+    result.
+  - If I merely ignore a wrong read result and force the known real
+    result, later isochronous transmit DMA usage works just fine.
+
+So let's just fix this chip bug up by the latter method.  Tested with
+JMB381 on kernel 3.13 and 4.3.
+
+Since OHCI-1394 generally requires 4 IT contexts at a minimum, this
+workaround is simply applied whenever the initial read of IsoXmitIntMask
+returns 0, regardless whether it's a JMicron chip or not.  I never heard
+of this issue together with any other chip though.
+
+I am not 100% sure that this fix works on the OHCI-1394 part of JMB380
+and JMB388 combo controllers exactly the same as on the JMB381 single-
+function controller, but so far I haven't had a chance to let an owner
+of a combo chip run a patched kernel.
+
+Strangely enough, IsoRecvIntMask is always reported correctly, even
+though it is probed right before IsoXmitIntMask.
+
+Reported-by: Clifford Dunn
+Reported-by: Craig Moore <craig.moore@qenos.com>
+Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/firewire/ohci.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/firewire/ohci.c
++++ b/drivers/firewire/ohci.c
+@@ -3675,6 +3675,11 @@ static int pci_probe(struct pci_dev *dev
+       reg_write(ohci, OHCI1394_IsoXmitIntMaskSet, ~0);
+       ohci->it_context_support = reg_read(ohci, OHCI1394_IsoXmitIntMaskSet);
++      /* JMicron JMB38x often shows 0 at first read, just ignore it */
++      if (!ohci->it_context_support) {
++              ohci_notice(ohci, "overriding IsoXmitIntMask\n");
++              ohci->it_context_support = 0xf;
++      }
+       reg_write(ohci, OHCI1394_IsoXmitIntMaskClear, ~0);
+       ohci->it_context_mask = ohci->it_context_support;
+       ohci->n_it = hweight32(ohci->it_context_mask);
diff --git a/queue-4.3/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch b/queue-4.3/nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch
new file mode 100644 (file)
index 0000000..7a5d30a
--- /dev/null
@@ -0,0 +1,39 @@
+From c812012f9ca7cf89c9e1a1cd512e6c3b5be04b85 Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@poochiereds.net>
+Date: Wed, 25 Nov 2015 13:50:11 -0500
+Subject: nfs: if we have no valid attrs, then don't declare the attribute cache valid
+
+From: Jeff Layton <jlayton@poochiereds.net>
+
+commit c812012f9ca7cf89c9e1a1cd512e6c3b5be04b85 upstream.
+
+If we pass in an empty nfs_fattr struct to nfs_update_inode, it will
+(correctly) not update any of the attributes, but it then clears the
+NFS_INO_INVALID_ATTR flag, which indicates that the attributes are
+up to date. Don't clear the flag if the fattr struct has no valid
+attrs to apply.
+
+Reviewed-by: Steve French <steve.french@primarydata.com>
+Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/inode.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/nfs/inode.c
++++ b/fs/nfs/inode.c
+@@ -1824,7 +1824,11 @@ static int nfs_update_inode(struct inode
+               if ((long)fattr->gencount - (long)nfsi->attr_gencount > 0)
+                       nfsi->attr_gencount = fattr->gencount;
+       }
+-      invalid &= ~NFS_INO_INVALID_ATTR;
++
++      /* Don't declare attrcache up to date if there were no attrs! */
++      if (fattr->valid != 0)
++              invalid &= ~NFS_INO_INVALID_ATTR;
++
+       /* Don't invalidate the data if we were to blame */
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+                               || S_ISLNK(inode->i_mode)))
diff --git a/queue-4.3/nfs4-limit-callback-decoding-to-received-bytes.patch b/queue-4.3/nfs4-limit-callback-decoding-to-received-bytes.patch
new file mode 100644 (file)
index 0000000..fc46183
--- /dev/null
@@ -0,0 +1,97 @@
+From 38b7631fbe42e6e247e9fc9879f961b14a687e3b Mon Sep 17 00:00:00 2001
+From: Benjamin Coddington <bcodding@redhat.com>
+Date: Fri, 20 Nov 2015 09:55:30 -0500
+Subject: nfs4: limit callback decoding to received bytes
+
+From: Benjamin Coddington <bcodding@redhat.com>
+
+commit 38b7631fbe42e6e247e9fc9879f961b14a687e3b upstream.
+
+A truncated cb_compound request will cause the client to decode null or
+data from a previous callback for nfs4.1 backchannel case, or uninitialized
+data for the nfs4.0 case. This is because the path through
+svc_process_common() advances the request's iov_base and decrements iov_len
+without adjusting the overall xdr_buf's len field.  That causes
+xdr_init_decode() to set up the xdr_stream with an incorrect length in
+nfs4_callback_compound().
+
+Fixing this for the nfs4.1 backchannel case first requires setting the
+correct iov_len and page_len based on the length of received data in the
+same manner as the nfs4.0 case.
+
+Then the request's xdr_buf length can be adjusted for both cases based upon
+the remaining iov_len and page_len.
+
+Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/callback_xdr.c         |    7 +++++--
+ net/sunrpc/backchannel_rqst.c |    8 ++++++++
+ net/sunrpc/svc.c              |    1 +
+ 3 files changed, 14 insertions(+), 2 deletions(-)
+
+--- a/fs/nfs/callback_xdr.c
++++ b/fs/nfs/callback_xdr.c
+@@ -76,7 +76,8 @@ static __be32 *read_buf(struct xdr_strea
+       p = xdr_inline_decode(xdr, nbytes);
+       if (unlikely(p == NULL))
+-              printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed!\n");
++              printk(KERN_WARNING "NFS: NFSv4 callback reply buffer overflowed "
++                                                      "or truncated request.\n");
+       return p;
+ }
+@@ -892,6 +893,7 @@ static __be32 nfs4_callback_compound(str
+       struct cb_compound_hdr_arg hdr_arg = { 0 };
+       struct cb_compound_hdr_res hdr_res = { NULL };
+       struct xdr_stream xdr_in, xdr_out;
++      struct xdr_buf *rq_arg = &rqstp->rq_arg;
+       __be32 *p, status;
+       struct cb_process_state cps = {
+               .drc_status = 0,
+@@ -903,7 +905,8 @@ static __be32 nfs4_callback_compound(str
+       dprintk("%s: start\n", __func__);
+-      xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base);
++      rq_arg->len = rq_arg->head[0].iov_len + rq_arg->page_len;
++      xdr_init_decode(&xdr_in, rq_arg, rq_arg->head[0].iov_base);
+       p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len);
+       xdr_init_encode(&xdr_out, &rqstp->rq_res, p);
+--- a/net/sunrpc/backchannel_rqst.c
++++ b/net/sunrpc/backchannel_rqst.c
+@@ -333,12 +333,20 @@ void xprt_complete_bc_request(struct rpc
+ {
+       struct rpc_xprt *xprt = req->rq_xprt;
+       struct svc_serv *bc_serv = xprt->bc_serv;
++      struct xdr_buf *rq_rcv_buf = &req->rq_rcv_buf;
+       spin_lock(&xprt->bc_pa_lock);
+       list_del(&req->rq_bc_pa_list);
+       xprt_dec_alloc_count(xprt, 1);
+       spin_unlock(&xprt->bc_pa_lock);
++      if (copied <= rq_rcv_buf->head[0].iov_len) {
++              rq_rcv_buf->head[0].iov_len = copied;
++              rq_rcv_buf->page_len = 0;
++      } else {
++              rq_rcv_buf->page_len = copied - rq_rcv_buf->head[0].iov_len;
++      }
++
+       req->rq_private_buf.len = copied;
+       set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
+--- a/net/sunrpc/svc.c
++++ b/net/sunrpc/svc.c
+@@ -1363,6 +1363,7 @@ bc_svc_process(struct svc_serv *serv, st
+       memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen);
+       memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg));
+       memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res));
++      rqstp->rq_arg.len = req->rq_private_buf.len;
+       /* reset result send buffer "put" position */
+       resv->iov_len = 0;
diff --git a/queue-4.3/nfs4-resend-layoutget-when-there-is-a-race-that-changes-the-seqid.patch b/queue-4.3/nfs4-resend-layoutget-when-there-is-a-race-that-changes-the-seqid.patch
new file mode 100644 (file)
index 0000000..be896f1
--- /dev/null
@@ -0,0 +1,96 @@
+From 4f2e9dce0c6348a95eaa56ade9bab18572221088 Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@poochiereds.net>
+Date: Wed, 25 Nov 2015 13:43:14 -0500
+Subject: nfs4: resend LAYOUTGET when there is a race that changes the seqid
+
+From: Jeff Layton <jlayton@poochiereds.net>
+
+commit 4f2e9dce0c6348a95eaa56ade9bab18572221088 upstream.
+
+pnfs_layout_process will check the returned layout stateid against what
+the kernel has in-core. If it turns out that the stateid we received is
+older, then we should resend the LAYOUTGET instead of falling back to
+MDS I/O.
+
+Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/pnfs.c |   56 +++++++++++++++++++++++++++++++-------------------------
+ 1 file changed, 31 insertions(+), 25 deletions(-)
+
+--- a/fs/nfs/pnfs.c
++++ b/fs/nfs/pnfs.c
+@@ -872,33 +872,38 @@ send_layoutget(struct pnfs_layout_hdr *l
+       dprintk("--> %s\n", __func__);
+-      lgp = kzalloc(sizeof(*lgp), gfp_flags);
+-      if (lgp == NULL)
+-              return NULL;
+-
+-      i_size = i_size_read(ino);
++      /*
++       * Synchronously retrieve layout information from server and
++       * store in lseg. If we race with a concurrent seqid morphing
++       * op, then re-send the LAYOUTGET.
++       */
++      do {
++              lgp = kzalloc(sizeof(*lgp), gfp_flags);
++              if (lgp == NULL)
++                      return NULL;
++
++              i_size = i_size_read(ino);
++
++              lgp->args.minlength = PAGE_CACHE_SIZE;
++              if (lgp->args.minlength > range->length)
++                      lgp->args.minlength = range->length;
++              if (range->iomode == IOMODE_READ) {
++                      if (range->offset >= i_size)
++                              lgp->args.minlength = 0;
++                      else if (i_size - range->offset < lgp->args.minlength)
++                              lgp->args.minlength = i_size - range->offset;
++              }
++              lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
++              lgp->args.range = *range;
++              lgp->args.type = server->pnfs_curr_ld->id;
++              lgp->args.inode = ino;
++              lgp->args.ctx = get_nfs_open_context(ctx);
++              lgp->gfp_flags = gfp_flags;
++              lgp->cred = lo->plh_lc_cred;
+-      lgp->args.minlength = PAGE_CACHE_SIZE;
+-      if (lgp->args.minlength > range->length)
+-              lgp->args.minlength = range->length;
+-      if (range->iomode == IOMODE_READ) {
+-              if (range->offset >= i_size)
+-                      lgp->args.minlength = 0;
+-              else if (i_size - range->offset < lgp->args.minlength)
+-                      lgp->args.minlength = i_size - range->offset;
+-      }
+-      lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
+-      lgp->args.range = *range;
+-      lgp->args.type = server->pnfs_curr_ld->id;
+-      lgp->args.inode = ino;
+-      lgp->args.ctx = get_nfs_open_context(ctx);
+-      lgp->gfp_flags = gfp_flags;
+-      lgp->cred = lo->plh_lc_cred;
++              lseg = nfs4_proc_layoutget(lgp, gfp_flags);
++      } while (lseg == ERR_PTR(-EAGAIN));
+-      /* Synchronously retrieve layout information from server and
+-       * store in lseg.
+-       */
+-      lseg = nfs4_proc_layoutget(lgp, gfp_flags);
+       if (IS_ERR(lseg)) {
+               switch (PTR_ERR(lseg)) {
+               case -ENOMEM:
+@@ -1687,6 +1692,7 @@ pnfs_layout_process(struct nfs4_layoutge
+               /* existing state ID, make sure the sequence number matches. */
+               if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
+                       dprintk("%s forget reply due to sequence\n", __func__);
++                      status = -EAGAIN;
+                       goto out_forget_reply;
+               }
+               pnfs_set_layout_stateid(lo, &res->stateid, false);
diff --git a/queue-4.3/nfs4-start-callback_ident-at-idr-1.patch b/queue-4.3/nfs4-start-callback_ident-at-idr-1.patch
new file mode 100644 (file)
index 0000000..c9ef318
--- /dev/null
@@ -0,0 +1,33 @@
+From c68a027c05709330fe5b2f50c50d5fa02124b5d8 Mon Sep 17 00:00:00 2001
+From: Benjamin Coddington <bcodding@redhat.com>
+Date: Fri, 20 Nov 2015 09:56:20 -0500
+Subject: nfs4: start callback_ident at idr 1
+
+From: Benjamin Coddington <bcodding@redhat.com>
+
+commit c68a027c05709330fe5b2f50c50d5fa02124b5d8 upstream.
+
+If clp->cl_cb_ident is zero, then nfs_cb_idr_remove_locked() skips removing
+it when the nfs_client is freed.  A decoding or server bug can then find
+and try to put that first nfs_client which would lead to a crash.
+
+Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
+Fixes: d6870312659d ("nfs4client: convert to idr_alloc()")
+Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfs/nfs4client.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/nfs/nfs4client.c
++++ b/fs/nfs/nfs4client.c
+@@ -33,7 +33,7 @@ static int nfs_get_cb_ident_idr(struct n
+               return ret;
+       idr_preload(GFP_KERNEL);
+       spin_lock(&nn->nfs_client_lock);
+-      ret = idr_alloc(&nn->cb_ident_idr, clp, 0, 0, GFP_NOWAIT);
++      ret = idr_alloc(&nn->cb_ident_idr, clp, 1, 0, GFP_NOWAIT);
+       if (ret >= 0)
+               clp->cl_cb_ident = ret;
+       spin_unlock(&nn->nfs_client_lock);
diff --git a/queue-4.3/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch b/queue-4.3/nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch
new file mode 100644 (file)
index 0000000..15f1bbd
--- /dev/null
@@ -0,0 +1,200 @@
+From 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 Mon Sep 17 00:00:00 2001
+From: Andrew Elble <aweits@rit.edu>
+Date: Thu, 15 Oct 2015 12:07:28 -0400
+Subject: nfsd: eliminate sending duplicate and repeated delegations
+
+From: Andrew Elble <aweits@rit.edu>
+
+commit 34ed9872e745fa56f10e9bef2cf3d2336c6c8816 upstream.
+
+We've observed the nfsd server in a state where there are
+multiple delegations on the same nfs4_file for the same client.
+The nfs client does attempt to DELEGRETURN these when they are presented to
+it - but apparently under some (unknown) circumstances the client does not
+manage to return all of them. This leads to the eventual
+attempt to CB_RECALL more than one delegation with the same nfs
+filehandle to the same client. The first recall will succeed, but the
+next recall will fail with NFS4ERR_BADHANDLE. This leads to the server
+having delegations on cl_revoked that the client has no way to FREE
+or DELEGRETURN, with resulting inability to recover. The state manager
+on the server will continually assert SEQ4_STATUS_RECALLABLE_STATE_REVOKED,
+and the state manager on the client will be looping unable to satisfy
+the server.
+
+List discussion also reports a race between OPEN and DELEGRETURN that
+will be avoided by only sending the delegation once to the
+client. This is also logically in accordance with RFC 5661 9.1.1 and 10.2.
+
+So, let's:
+
+1.) Not hand out duplicate delegations.
+2.) Only send them to the client once.
+
+RFC 5661:
+
+9.1.1:
+"Delegations and layouts, on the other hand, are not associated with a
+specific owner but are associated with the client as a whole
+(identified by a client ID)."
+
+10.2:
+"...the stateid for a delegation is associated with a client ID and may be
+used on behalf of all the open-owners for the given client.  A
+delegation is made to the client as a whole and not to any specific
+process or thread of control within it."
+
+Reported-by: Eric Meddaugh <etmsys@rit.edu>
+Cc: Trond Myklebust <trond.myklebust@primarydata.com>
+Cc: Olga Kornievskaia <aglo@umich.edu>
+Signed-off-by: Andrew Elble <aweits@rit.edu>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfsd/nfs4state.c |   94 ++++++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 84 insertions(+), 10 deletions(-)
+
+--- a/fs/nfsd/nfs4state.c
++++ b/fs/nfsd/nfs4state.c
+@@ -765,16 +765,68 @@ void nfs4_unhash_stid(struct nfs4_stid *
+       s->sc_type = 0;
+ }
+-static void
++/**
++ * nfs4_get_existing_delegation - Discover if this delegation already exists
++ * @clp:     a pointer to the nfs4_client we're granting a delegation to
++ * @fp:      a pointer to the nfs4_file we're granting a delegation on
++ *
++ * Return:
++ *      On success: 0 if an existing delegation was not found.
++ *
++ *      On error: -EAGAIN if one was previously granted to this nfs4_client
++ *                 for this nfs4_file.
++ *
++ */
++
++static int
++nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
++{
++      struct nfs4_delegation *searchdp = NULL;
++      struct nfs4_client *searchclp = NULL;
++
++      lockdep_assert_held(&state_lock);
++      lockdep_assert_held(&fp->fi_lock);
++
++      list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
++              searchclp = searchdp->dl_stid.sc_client;
++              if (clp == searchclp) {
++                      return -EAGAIN;
++              }
++      }
++      return 0;
++}
++
++/**
++ * hash_delegation_locked - Add a delegation to the appropriate lists
++ * @dp:     a pointer to the nfs4_delegation we are adding.
++ * @fp:     a pointer to the nfs4_file we're granting a delegation on
++ *
++ * Return:
++ *      On success: 0 if the delegation was successfully hashed.
++ *
++ *      On error: -EAGAIN if one was previously granted to this
++ *                 nfs4_client for this nfs4_file. Delegation is not hashed.
++ *
++ */
++
++static int
+ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
+ {
++      int status;
++      struct nfs4_client *clp = dp->dl_stid.sc_client;
++
+       lockdep_assert_held(&state_lock);
+       lockdep_assert_held(&fp->fi_lock);
++      status = nfs4_get_existing_delegation(clp, fp);
++      if (status)
++              return status;
++      ++fp->fi_delegees;
+       atomic_inc(&dp->dl_stid.sc_count);
+       dp->dl_stid.sc_type = NFS4_DELEG_STID;
+       list_add(&dp->dl_perfile, &fp->fi_delegations);
+-      list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
++      list_add(&dp->dl_perclnt, &clp->cl_delegations);
++      return 0;
+ }
+ static bool
+@@ -3946,6 +3998,18 @@ static struct file_lock *nfs4_alloc_init
+       return fl;
+ }
++/**
++ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
++ * @dp:   a pointer to the nfs4_delegation we're adding.
++ *
++ * Return:
++ *      On success: Return code will be 0 on success.
++ *
++ *      On error: -EAGAIN if there was an existing delegation.
++ *                 nonzero if there is an error in other cases.
++ *
++ */
++
+ static int nfs4_setlease(struct nfs4_delegation *dp)
+ {
+       struct nfs4_file *fp = dp->dl_stid.sc_file;
+@@ -3977,16 +4041,19 @@ static int nfs4_setlease(struct nfs4_del
+               goto out_unlock;
+       /* Race breaker */
+       if (fp->fi_deleg_file) {
+-              status = 0;
+-              ++fp->fi_delegees;
+-              hash_delegation_locked(dp, fp);
++              status = hash_delegation_locked(dp, fp);
+               goto out_unlock;
+       }
+       fp->fi_deleg_file = filp;
+-      fp->fi_delegees = 1;
+-      hash_delegation_locked(dp, fp);
++      fp->fi_delegees = 0;
++      status = hash_delegation_locked(dp, fp);
+       spin_unlock(&fp->fi_lock);
+       spin_unlock(&state_lock);
++      if (status) {
++              /* Should never happen, this is a new fi_deleg_file  */
++              WARN_ON_ONCE(1);
++              goto out_fput;
++      }
+       return 0;
+ out_unlock:
+       spin_unlock(&fp->fi_lock);
+@@ -4006,6 +4073,15 @@ nfs4_set_delegation(struct nfs4_client *
+       if (fp->fi_had_conflict)
+               return ERR_PTR(-EAGAIN);
++      spin_lock(&state_lock);
++      spin_lock(&fp->fi_lock);
++      status = nfs4_get_existing_delegation(clp, fp);
++      spin_unlock(&fp->fi_lock);
++      spin_unlock(&state_lock);
++
++      if (status)
++              return ERR_PTR(status);
++
+       dp = alloc_init_deleg(clp, fh, odstate);
+       if (!dp)
+               return ERR_PTR(-ENOMEM);
+@@ -4024,9 +4100,7 @@ nfs4_set_delegation(struct nfs4_client *
+               status = -EAGAIN;
+               goto out_unlock;
+       }
+-      ++fp->fi_delegees;
+-      hash_delegation_locked(dp, fp);
+-      status = 0;
++      status = hash_delegation_locked(dp, fp);
+ out_unlock:
+       spin_unlock(&fp->fi_lock);
+       spin_unlock(&state_lock);
diff --git a/queue-4.3/nfsd-serialize-state-seqid-morphing-operations.patch b/queue-4.3/nfsd-serialize-state-seqid-morphing-operations.patch
new file mode 100644 (file)
index 0000000..810d919
--- /dev/null
@@ -0,0 +1,207 @@
+From 35a92fe8770ce54c5eb275cd76128645bea2d200 Mon Sep 17 00:00:00 2001
+From: Jeff Layton <jlayton@poochiereds.net>
+Date: Thu, 17 Sep 2015 07:47:08 -0400
+Subject: nfsd: serialize state seqid morphing operations
+
+From: Jeff Layton <jlayton@poochiereds.net>
+
+commit 35a92fe8770ce54c5eb275cd76128645bea2d200 upstream.
+
+Andrew was seeing a race occur when an OPEN and OPEN_DOWNGRADE were
+running in parallel. The server would receive the OPEN_DOWNGRADE first
+and check its seqid, but then an OPEN would race in and bump it. The
+OPEN_DOWNGRADE would then complete and bump the seqid again.  The result
+was that the OPEN_DOWNGRADE would be applied after the OPEN, even though
+it should have been rejected since the seqid changed.
+
+The only recourse we have here I think is to serialize operations that
+bump the seqid in a stateid, particularly when we're given a seqid in
+the call. To address this, we add a new rw_semaphore to the
+nfs4_ol_stateid struct. We do a down_write prior to checking the seqid
+after looking up the stateid to ensure that nothing else is going to
+bump it while we're operating on it.
+
+In the case of OPEN, we do a down_read, as the call doesn't contain a
+seqid. Those can run in parallel -- we just need to serialize them when
+there is a concurrent OPEN_DOWNGRADE or CLOSE.
+
+LOCK and LOCKU however always take the write lock as there is no
+opportunity for parallelizing those.
+
+Reported-and-Tested-by: Andrew W Elble <aweits@rit.edu>
+Signed-off-by: Jeff Layton <jeff.layton@primarydata.com>
+Signed-off-by: J. Bruce Fields <bfields@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/nfsd/nfs4state.c |   33 ++++++++++++++++++++++++++++-----
+ fs/nfsd/state.h     |   19 ++++++++++---------
+ 2 files changed, 38 insertions(+), 14 deletions(-)
+
+--- a/fs/nfsd/nfs4state.c
++++ b/fs/nfsd/nfs4state.c
+@@ -3360,6 +3360,7 @@ static void init_open_stateid(struct nfs
+       stp->st_access_bmap = 0;
+       stp->st_deny_bmap = 0;
+       stp->st_openstp = NULL;
++      init_rwsem(&stp->st_rwsem);
+       spin_lock(&oo->oo_owner.so_client->cl_lock);
+       list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
+       spin_lock(&fp->fi_lock);
+@@ -4187,15 +4188,20 @@ nfsd4_process_open2(struct svc_rqst *rqs
+        */
+       if (stp) {
+               /* Stateid was found, this is an OPEN upgrade */
++              down_read(&stp->st_rwsem);
+               status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
+-              if (status)
++              if (status) {
++                      up_read(&stp->st_rwsem);
+                       goto out;
++              }
+       } else {
+               stp = open->op_stp;
+               open->op_stp = NULL;
+               init_open_stateid(stp, fp, open);
++              down_read(&stp->st_rwsem);
+               status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
+               if (status) {
++                      up_read(&stp->st_rwsem);
+                       release_open_stateid(stp);
+                       goto out;
+               }
+@@ -4207,6 +4213,7 @@ nfsd4_process_open2(struct svc_rqst *rqs
+       }
+       update_stateid(&stp->st_stid.sc_stateid);
+       memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
++      up_read(&stp->st_rwsem);
+       if (nfsd4_has_session(&resp->cstate)) {
+               if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
+@@ -4819,10 +4826,13 @@ static __be32 nfs4_seqid_op_checks(struc
+                * revoked delegations are kept only for free_stateid.
+                */
+               return nfserr_bad_stateid;
++      down_write(&stp->st_rwsem);
+       status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
+-      if (status)
+-              return status;
+-      return nfs4_check_fh(current_fh, &stp->st_stid);
++      if (status == nfs_ok)
++              status = nfs4_check_fh(current_fh, &stp->st_stid);
++      if (status != nfs_ok)
++              up_write(&stp->st_rwsem);
++      return status;
+ }
+ /* 
+@@ -4869,6 +4879,7 @@ static __be32 nfs4_preprocess_confirmed_
+               return status;
+       oo = openowner(stp->st_stateowner);
+       if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
++              up_write(&stp->st_rwsem);
+               nfs4_put_stid(&stp->st_stid);
+               return nfserr_bad_stateid;
+       }
+@@ -4899,11 +4910,14 @@ nfsd4_open_confirm(struct svc_rqst *rqst
+               goto out;
+       oo = openowner(stp->st_stateowner);
+       status = nfserr_bad_stateid;
+-      if (oo->oo_flags & NFS4_OO_CONFIRMED)
++      if (oo->oo_flags & NFS4_OO_CONFIRMED) {
++              up_write(&stp->st_rwsem);
+               goto put_stateid;
++      }
+       oo->oo_flags |= NFS4_OO_CONFIRMED;
+       update_stateid(&stp->st_stid.sc_stateid);
+       memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
++      up_write(&stp->st_rwsem);
+       dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
+               __func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
+@@ -4982,6 +4996,7 @@ nfsd4_open_downgrade(struct svc_rqst *rq
+       memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+       status = nfs_ok;
+ put_stateid:
++      up_write(&stp->st_rwsem);
+       nfs4_put_stid(&stp->st_stid);
+ out:
+       nfsd4_bump_seqid(cstate, status);
+@@ -5035,6 +5050,7 @@ nfsd4_close(struct svc_rqst *rqstp, stru
+               goto out; 
+       update_stateid(&stp->st_stid.sc_stateid);
+       memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
++      up_write(&stp->st_rwsem);
+       nfsd4_close_open_stateid(stp);
+@@ -5260,6 +5276,7 @@ init_lock_stateid(struct nfs4_ol_stateid
+       stp->st_access_bmap = 0;
+       stp->st_deny_bmap = open_stp->st_deny_bmap;
+       stp->st_openstp = open_stp;
++      init_rwsem(&stp->st_rwsem);
+       list_add(&stp->st_locks, &open_stp->st_locks);
+       list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
+       spin_lock(&fp->fi_lock);
+@@ -5428,6 +5445,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+                                       &open_stp, nn);
+               if (status)
+                       goto out;
++              up_write(&open_stp->st_rwsem);
+               open_sop = openowner(open_stp->st_stateowner);
+               status = nfserr_bad_stateid;
+               if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
+@@ -5435,6 +5453,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struc
+                       goto out;
+               status = lookup_or_create_lock_state(cstate, open_stp, lock,
+                                                       &lock_stp, &new);
++              if (status == nfs_ok)
++                      down_write(&lock_stp->st_rwsem);
+       } else {
+               status = nfs4_preprocess_seqid_op(cstate,
+                                      lock->lk_old_lock_seqid,
+@@ -5540,6 +5560,8 @@ out:
+                   seqid_mutating_err(ntohl(status)))
+                       lock_sop->lo_owner.so_seqid++;
++              up_write(&lock_stp->st_rwsem);
++
+               /*
+                * If this is a new, never-before-used stateid, and we are
+                * returning an error, then just go ahead and release it.
+@@ -5709,6 +5731,7 @@ nfsd4_locku(struct svc_rqst *rqstp, stru
+ fput:
+       fput(filp);
+ put_stateid:
++      up_write(&stp->st_rwsem);
+       nfs4_put_stid(&stp->st_stid);
+ out:
+       nfsd4_bump_seqid(cstate, status);
+--- a/fs/nfsd/state.h
++++ b/fs/nfsd/state.h
+@@ -534,15 +534,16 @@ struct nfs4_file {
+  * Better suggestions welcome.
+  */
+ struct nfs4_ol_stateid {
+-      struct nfs4_stid    st_stid; /* must be first field */
+-      struct list_head              st_perfile;
+-      struct list_head              st_perstateowner;
+-      struct list_head              st_locks;
+-      struct nfs4_stateowner      * st_stateowner;
+-      struct nfs4_clnt_odstate    * st_clnt_odstate;
+-      unsigned char                 st_access_bmap;
+-      unsigned char                 st_deny_bmap;
+-      struct nfs4_ol_stateid         * st_openstp;
++      struct nfs4_stid                st_stid;
++      struct list_head                st_perfile;
++      struct list_head                st_perstateowner;
++      struct list_head                st_locks;
++      struct nfs4_stateowner          *st_stateowner;
++      struct nfs4_clnt_odstate        *st_clnt_odstate;
++      unsigned char                   st_access_bmap;
++      unsigned char                   st_deny_bmap;
++      struct nfs4_ol_stateid          *st_openstp;
++      struct rw_semaphore             st_rwsem;
+ };
+ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
diff --git a/queue-4.3/ocfs2-fix-umask-ignored-issue.patch b/queue-4.3/ocfs2-fix-umask-ignored-issue.patch
new file mode 100644 (file)
index 0000000..4ec1841
--- /dev/null
@@ -0,0 +1,36 @@
+From 8f1eb48758aacf6c1ffce18179295adbf3bd7640 Mon Sep 17 00:00:00 2001
+From: Junxiao Bi <junxiao.bi@oracle.com>
+Date: Fri, 20 Nov 2015 15:57:30 -0800
+Subject: ocfs2: fix umask ignored issue
+
+From: Junxiao Bi <junxiao.bi@oracle.com>
+
+commit 8f1eb48758aacf6c1ffce18179295adbf3bd7640 upstream.
+
+A newly created file's mode is not masked with the umask, so the umask has
+no effect on ocfs2 volumes.
+
+Fixes: 702e5bc ("ocfs2: use generic posix ACL infrastructure")
+Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Gang He <ghe@suse.com>
+Cc: Mark Fasheh <mfasheh@suse.de>
+Cc: Joel Becker <jlbec@evilplan.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/ocfs2/namei.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/ocfs2/namei.c
++++ b/fs/ocfs2/namei.c
+@@ -374,6 +374,8 @@ static int ocfs2_mknod(struct inode *dir
+               mlog_errno(status);
+               goto leave;
+       }
++      /* update inode->i_mode after mask with "umask". */
++      inode->i_mode = mode;
+       handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb,
+                                                           S_ISDIR(mode),
diff --git a/queue-4.3/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch b/queue-4.3/rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch
new file mode 100644 (file)
index 0000000..8d5c1a3
--- /dev/null
@@ -0,0 +1,34 @@
+From 70b16db86f564977df074072143284aec2cb1162 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov <idryomov@gmail.com>
+Date: Fri, 27 Nov 2015 19:23:24 +0100
+Subject: rbd: don't put snap_context twice in rbd_queue_workfn()
+
+From: Ilya Dryomov <idryomov@gmail.com>
+
+commit 70b16db86f564977df074072143284aec2cb1162 upstream.
+
+Commit 4e752f0ab0e8 ("rbd: access snapshot context and mapping size
+safely") moved ceph_get_snap_context() out of rbd_img_request_create()
+and into rbd_queue_workfn(), adding a ceph_put_snap_context() to the
+error path in rbd_queue_workfn().  However, rbd_img_request_create()
+consumes a ref on snapc, so calling ceph_put_snap_context() after
+a successful rbd_img_request_create() leads to an extra put.  Fix it.
+
+Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Josh Durgin <jdurgin@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/block/rbd.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -3444,6 +3444,7 @@ static void rbd_queue_workfn(struct work
+               goto err_rq;
+       }
+       img_request->rq = rq;
++      snapc = NULL; /* img_request consumes a ref */
+       if (op_type == OBJ_OP_DISCARD)
+               result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
index f8b05d7fc78ce8db997421c89f08959b614a9275..4fc212dcd101c1060c98dd0628c8cd82ec3f20a1 100644 (file)
@@ -42,3 +42,32 @@ net-neighbour-fix-crash-at-dumping-device-agnostic-proxy-entries.patch
 ipv6-sctp-implement-sctp_v6_destroy_sock.patch
 openvswitch-fix-hangup-on-vxlan-gre-geneve-device-deletion.patch
 net_sched-fix-qdisc_tree_decrease_qlen-races.patch
+btrfs-fix-resending-received-snapshot-with-parent.patch
+btrfs-check-unsupported-filters-in-balance-arguments.patch
+btrfs-fix-file-corruption-and-data-loss-after-cloning-inline-extents.patch
+btrfs-fix-truncation-of-compressed-and-inlined-extents.patch
+btrfs-fix-regression-when-running-delayed-references.patch
+btrfs-fix-race-leading-to-incorrect-item-deletion-when-dropping-extents.patch
+btrfs-fix-race-leading-to-bug_on-when-running-delalloc-for-nodatacow.patch
+btrfs-fix-race-when-listing-an-inode-s-xattrs.patch
+btrfs-fix-signed-overflows-in-btrfs_sync_file.patch
+rbd-don-t-put-snap_context-twice-in-rbd_queue_workfn.patch
+ext4-crypto-fix-memory-leak-in-ext4_bio_write_page.patch
+ext4-crypto-replace-some-bug_on-s-with-error-checks.patch
+ext4-crypto-fix-bugs-in-ext4_encrypted_zeroout.patch
+ext4-fix-potential-use-after-free-in-__ext4_journal_stop.patch
+ext4-jbd2-ensure-entering-into-panic-after-recording-an-error-in-superblock.patch
+firewire-ohci-fix-jmicron-jmb38x-it-context-discovery.patch
+nfsd-serialize-state-seqid-morphing-operations.patch
+nfsd-eliminate-sending-duplicate-and-repeated-delegations.patch
+debugfs-fix-refcount-imbalance-in-start_creating.patch
+nfs4-limit-callback-decoding-to-received-bytes.patch
+nfs4-start-callback_ident-at-idr-1.patch
+nfs4-resend-layoutget-when-there-is-a-race-that-changes-the-seqid.patch
+nfs-if-we-have-no-valid-attrs-then-don-t-declare-the-attribute-cache-valid.patch
+ocfs2-fix-umask-ignored-issue.patch
+block-fix-segment-split.patch
+ceph-fix-message-length-computation.patch
+alsa-pci-depend-on-zone_dma.patch
+alsa-hda-hdmi-apply-skylake-fix-ups-to-broxton-display-codec.patch
+cobalt-fix-kconfig-dependency.patch