git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
3.14-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 3 Feb 2015 23:11:48 +0000 (15:11 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 3 Feb 2015 23:11:48 +0000 (15:11 -0800)
added patches:
rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
target-drop-arbitrary-maximum-i-o-size-limit.patch
workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch

queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch [new file with mode: 0644]
queue-3.14/series
queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch [new file with mode: 0644]
queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch [new file with mode: 0644]

diff --git a/queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch b/queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
new file mode 100644 (file)
index 0000000..32beda5
--- /dev/null
@@ -0,0 +1,94 @@
+From ae43e9d05eb4bd324155292f889fbd001c4faea8 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov <idryomov@redhat.com>
+Date: Mon, 19 Jan 2015 18:13:43 +0300
+Subject: rbd: fix rbd_dev_parent_get() when parent_overlap == 0
+
+From: Ilya Dryomov <idryomov@redhat.com>
+
+commit ae43e9d05eb4bd324155292f889fbd001c4faea8 upstream.
+
+The comment for rbd_dev_parent_get() said
+
+    * We must get the reference before checking for the overlap to
+    * coordinate properly with zeroing the parent overlap in
+    * rbd_dev_v2_parent_info() when an image gets flattened.  We
+    * drop it again if there is no overlap.
+
+but the "drop it again if there is no overlap" part was missing from
+the implementation.  This led to absurd parent_ref values for images
+with parent_overlap == 0, as parent_ref was incremented for each
+img_request and virtually never decremented.
+
+Fix this by leveraging the fact that the refresh path calls
+rbd_dev_v2_parent_info() under header_rwsem, and taking it for read in
+rbd_dev_parent_get() instead of messing around with atomics.  Get rid
+of barriers in rbd_dev_v2_parent_info() while at it - I don't see what
+they'd pair with now and I suspect we are in a pretty miserable
+situation as far as proper locking goes regardless.
+
+Signed-off-by: Ilya Dryomov <idryomov@redhat.com>
+Reviewed-by: Josh Durgin <jdurgin@redhat.com>
+Reviewed-by: Alex Elder <elder@linaro.org>
+[idryomov@redhat.com: backport to 3.14: context]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/rbd.c |   20 ++++++--------------
+ 1 file changed, 6 insertions(+), 14 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -1926,32 +1926,26 @@ static void rbd_dev_parent_put(struct rb
+  * If an image has a non-zero parent overlap, get a reference to its
+  * parent.
+  *
+- * We must get the reference before checking for the overlap to
+- * coordinate properly with zeroing the parent overlap in
+- * rbd_dev_v2_parent_info() when an image gets flattened.  We
+- * drop it again if there is no overlap.
+- *
+  * Returns true if the rbd device has a parent with a non-zero
+  * overlap and a reference for it was successfully taken, or
+  * false otherwise.
+  */
+ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
+ {
+-      int counter;
++      int counter = 0;
+       if (!rbd_dev->parent_spec)
+               return false;
+-      counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
+-      if (counter > 0 && rbd_dev->parent_overlap)
+-              return true;
+-
+-      /* Image was flattened, but parent is not yet torn down */
++      down_read(&rbd_dev->header_rwsem);
++      if (rbd_dev->parent_overlap)
++              counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
++      up_read(&rbd_dev->header_rwsem);
+       if (counter < 0)
+               rbd_warn(rbd_dev, "parent reference overflow\n");
+-      return false;
++      return counter > 0;
+ }
+ /*
+@@ -3904,7 +3898,6 @@ static int rbd_dev_v2_parent_info(struct
+                */
+               if (rbd_dev->parent_overlap) {
+                       rbd_dev->parent_overlap = 0;
+-                      smp_mb();
+                       rbd_dev_parent_put(rbd_dev);
+                       pr_info("%s: clone image has been flattened\n",
+                               rbd_dev->disk->disk_name);
+@@ -3948,7 +3941,6 @@ static int rbd_dev_v2_parent_info(struct
+        * treat it specially.
+        */
+       rbd_dev->parent_overlap = overlap;
+-      smp_mb();
+       if (!overlap) {
+               /* A null parent_spec indicates it's the initial probe */
diff --git a/queue-3.14/series b/queue-3.14/series
index 01394ccd4678b283b75caa0b9c0d2be06ddd7d96..55802529148af82b9bbaedb1fa8c0e22d10a960b 100644 (file)
@@ -28,3 +28,6 @@ arm-dma-ensure-that-old-section-mappings-are-flushed-from-the-tlb.patch
 pstore-clarify-clearing-of-_read_cnt-in-ramoops_context.patch
 pstore-skip-zero-size-persistent-ram-buffer-in-traverse.patch
 pstore-fix-null-pointer-fault-if-get-null-prz-in-ramoops_get_next_prz.patch
+rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
+workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
+target-drop-arbitrary-maximum-i-o-size-limit.patch
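
For the rbd fix above, a minimal userspace sketch of the pattern it
adopts may help: a reference is taken only while a read lock guarantees
that the overlap cannot be zeroed concurrently, and the flattening path
drops the pinned reference under the write lock.  This is a
pthread-based illustration with made-up names, not the actual rbd code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct parent_state {
        pthread_rwlock_t lock;   /* stands in for header_rwsem */
        unsigned long overlap;   /* stands in for parent_overlap */
        atomic_int refs;         /* stands in for parent_ref */
};

/* Reader side: only pin a reference while overlap is known non-zero. */
static bool parent_get(struct parent_state *s)
{
        int counter = 0;

        pthread_rwlock_rdlock(&s->lock);
        if (s->overlap)
                counter = atomic_fetch_add(&s->refs, 1) + 1;
        pthread_rwlock_unlock(&s->lock);

        return counter > 0;
}

/* Writer side: zero the overlap and drop the reference it pinned. */
static void parent_flatten(struct parent_state *s)
{
        pthread_rwlock_wrlock(&s->lock);
        if (s->overlap) {
                s->overlap = 0;
                atomic_fetch_sub(&s->refs, 1);
        }
        pthread_rwlock_unlock(&s->lock);
}

int main(void)
{
        struct parent_state s = {
                .lock = PTHREAD_RWLOCK_INITIALIZER,
                .overlap = 1,
        };

        atomic_init(&s.refs, 1);
        printf("got ref: %d\n", parent_get(&s));  /* 1: overlap is non-zero */
        parent_flatten(&s);
        printf("got ref: %d\n", parent_get(&s));  /* 0: image was flattened */
        return 0;
}

With this arrangement the reader can never bump the reference count
after the overlap has been zeroed, which is the leak the original
unsynchronized check-then-increment allowed.
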
diff --git a/queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch b/queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch
new file mode 100644 (file)
index 0000000..badff3b
--- /dev/null
@@ -0,0 +1,159 @@
+From 046ba64285a4389ae5e9a7dfa253c6bff3d7c341 Mon Sep 17 00:00:00 2001
+From: Nicholas Bellinger <nab@linux-iscsi.org>
+Date: Tue, 6 Jan 2015 16:10:37 -0800
+Subject: target: Drop arbitrary maximum I/O size limit
+
+From: Nicholas Bellinger <nab@linux-iscsi.org>
+
+commit 046ba64285a4389ae5e9a7dfa253c6bff3d7c341 upstream.
+
+This patch drops the arbitrary maximum I/O size limit in sbc_parse_cdb(),
+which currently for fabric_max_sectors is hardcoded to 8192 (4 MB for 512
+byte sector devices), and for hw_max_sectors is a backend driver dependent
+value.
+
+This limit is problematic because Linux initiators have only recently
+started to honor the MAXIMUM TRANSFER LENGTH block limit, and other
+non-Linux based initiators (e.g. MSFT Fibre Channel) can also generate
+I/Os larger than 4 MB in size.
+
+Currently when this happens, the following message will appear on the
+target, resulting in I/Os being returned with non-recoverable status:
+
+  SCSI OP 28h with too big sectors 16384 exceeds fabric_max_sectors: 8192
+
+Instead, drop both [fabric,hw]_max_sector checks in sbc_parse_cdb(),
+and convert the existing hw_max_sectors into a purely informational
+attribute used to represent the granularity that backend driver and/or
+subsystem code is splitting I/Os upon.
+
+Also, update FILEIO with an explicit FD_MAX_BYTES check in fd_execute_rw()
+to deal with the one special iovec limitation case.
+
+v2 changes:
+  - Drop hw_max_sectors check in sbc_parse_cdb()
+
+Reported-by: Lance Gropper <lance.gropper@qosserver.com>
+Reported-by: Stefan Priebe <s.priebe@profihost.ag>
+Cc: Christoph Hellwig <hch@lst.de>
+Cc: Martin K. Petersen <martin.petersen@oracle.com>
+Cc: Roland Dreier <roland@purestorage.com>
+Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/target/target_core_device.c |    8 ++++----
+ drivers/target/target_core_file.c   |   11 ++++++++++-
+ drivers/target/target_core_iblock.c |    2 +-
+ drivers/target/target_core_sbc.c    |   15 ---------------
+ drivers/target/target_core_spc.c    |    5 +----
+ 5 files changed, 16 insertions(+), 25 deletions(-)
+
+--- a/drivers/target/target_core_device.c
++++ b/drivers/target/target_core_device.c
+@@ -1153,10 +1153,10 @@ int se_dev_set_optimal_sectors(struct se
+                               " changed for TCM/pSCSI\n", dev);
+               return -EINVAL;
+       }
+-      if (optimal_sectors > dev->dev_attrib.fabric_max_sectors) {
++      if (optimal_sectors > dev->dev_attrib.hw_max_sectors) {
+               pr_err("dev[%p]: Passed optimal_sectors %u cannot be"
+-                      " greater than fabric_max_sectors: %u\n", dev,
+-                      optimal_sectors, dev->dev_attrib.fabric_max_sectors);
++                      " greater than hw_max_sectors: %u\n", dev,
++                      optimal_sectors, dev->dev_attrib.hw_max_sectors);
+               return -EINVAL;
+       }
+@@ -1565,7 +1565,6 @@ struct se_device *target_alloc_device(st
+                               DA_UNMAP_GRANULARITY_ALIGNMENT_DEFAULT;
+       dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
+       dev->dev_attrib.fabric_max_sectors = DA_FABRIC_MAX_SECTORS;
+-      dev->dev_attrib.optimal_sectors = DA_FABRIC_MAX_SECTORS;
+       xcopy_lun = &dev->xcopy_lun;
+       xcopy_lun->lun_se_dev = dev;
+@@ -1606,6 +1605,7 @@ int target_configure_device(struct se_de
+       dev->dev_attrib.hw_max_sectors =
+               se_dev_align_max_sectors(dev->dev_attrib.hw_max_sectors,
+                                        dev->dev_attrib.hw_block_size);
++      dev->dev_attrib.optimal_sectors = dev->dev_attrib.hw_max_sectors;
+       dev->dev_index = scsi_get_new_index(SCSI_DEVICE_INDEX);
+       dev->creation_time = get_jiffies_64();
+--- a/drivers/target/target_core_file.c
++++ b/drivers/target/target_core_file.c
+@@ -620,7 +620,16 @@ fd_execute_rw(struct se_cmd *cmd, struct
+       struct fd_prot fd_prot;
+       sense_reason_t rc;
+       int ret = 0;
+-
++      /*
++       * We are currently limited by the number of iovecs (2048) per
++       * single vfs_[writev,readv] call.
++       */
++      if (cmd->data_length > FD_MAX_BYTES) {
++              pr_err("FILEIO: Not able to process I/O of %u bytes due to"
++                     "FD_MAX_BYTES: %u iovec count limitiation\n",
++                      cmd->data_length, FD_MAX_BYTES);
++              return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
++      }
+       /*
+        * Call vectorized fileio functions to map struct scatterlist
+        * physical memory addresses to struct iovec virtual memory.
+--- a/drivers/target/target_core_iblock.c
++++ b/drivers/target/target_core_iblock.c
+@@ -123,7 +123,7 @@ static int iblock_configure_device(struc
+       q = bdev_get_queue(bd);
+       dev->dev_attrib.hw_block_size = bdev_logical_block_size(bd);
+-      dev->dev_attrib.hw_max_sectors = UINT_MAX;
++      dev->dev_attrib.hw_max_sectors = queue_max_hw_sectors(q);
+       dev->dev_attrib.hw_queue_depth = q->nr_requests;
+       /*
+--- a/drivers/target/target_core_sbc.c
++++ b/drivers/target/target_core_sbc.c
+@@ -910,21 +910,6 @@ sbc_parse_cdb(struct se_cmd *cmd, struct
+       if (cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) {
+               unsigned long long end_lba;
+-              if (sectors > dev->dev_attrib.fabric_max_sectors) {
+-                      printk_ratelimited(KERN_ERR "SCSI OP %02xh with too"
+-                              " big sectors %u exceeds fabric_max_sectors:"
+-                              " %u\n", cdb[0], sectors,
+-                              dev->dev_attrib.fabric_max_sectors);
+-                      return TCM_INVALID_CDB_FIELD;
+-              }
+-              if (sectors > dev->dev_attrib.hw_max_sectors) {
+-                      printk_ratelimited(KERN_ERR "SCSI OP %02xh with too"
+-                              " big sectors %u exceeds backend hw_max_sectors:"
+-                              " %u\n", cdb[0], sectors,
+-                              dev->dev_attrib.hw_max_sectors);
+-                      return TCM_INVALID_CDB_FIELD;
+-              }
+-
+               end_lba = dev->transport->get_blocks(dev) + 1;
+               if (cmd->t_task_lba + sectors > end_lba) {
+                       pr_err("cmd exceeds last lba %llu "
+--- a/drivers/target/target_core_spc.c
++++ b/drivers/target/target_core_spc.c
+@@ -503,7 +503,6 @@ static sense_reason_t
+ spc_emulate_evpd_b0(struct se_cmd *cmd, unsigned char *buf)
+ {
+       struct se_device *dev = cmd->se_dev;
+-      u32 max_sectors;
+       int have_tp = 0;
+       int opt, min;
+@@ -537,9 +536,7 @@ spc_emulate_evpd_b0(struct se_cmd *cmd,
+       /*
+        * Set MAXIMUM TRANSFER LENGTH
+        */
+-      max_sectors = min(dev->dev_attrib.fabric_max_sectors,
+-                        dev->dev_attrib.hw_max_sectors);
+-      put_unaligned_be32(max_sectors, &buf[8]);
++      put_unaligned_be32(dev->dev_attrib.hw_max_sectors, &buf[8]);
+       /*
+        * Set OPTIMAL TRANSFER LENGTH
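
The spc_emulate_evpd_b0() hunk above stores hw_max_sectors big-endian
at bytes 8-11 of the Block Limits VPD page, which is where an initiator
reads MAXIMUM TRANSFER LENGTH from.  A small standalone sketch of that
packing and of reading it back; the helper names and the example value
are illustrative only, not the target code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Userspace stand-in for the kernel's put_unaligned_be32(). */
static void put_be32(uint32_t val, uint8_t *p)
{
        p[0] = val >> 24;
        p[1] = val >> 16;
        p[2] = val >> 8;
        p[3] = val;
}

static uint32_t get_be32(const uint8_t *p)
{
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8) | p[3];
}

int main(void)
{
        uint8_t buf[64];
        uint32_t hw_max_sectors = 16384;   /* example value only */

        memset(buf, 0, sizeof(buf));
        buf[1] = 0xb0;                     /* Block Limits VPD page code */
        put_be32(hw_max_sectors, &buf[8]); /* MAXIMUM TRANSFER LENGTH */

        /* An initiator honoring block limits reads the field back here. */
        printf("max transfer length: %u sectors\n", get_be32(&buf[8]));
        return 0;
}

After this patch the value reported in that field comes straight from
hw_max_sectors, and sbc_parse_cdb() no longer enforces it as a hard
limit on incoming I/O.
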
diff --git a/queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch b/queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
new file mode 100644 (file)
index 0000000..bdd72dc
--- /dev/null
@@ -0,0 +1,199 @@
+From 29187a9eeaf362d8422e62e17a22a6e115277a49 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Fri, 16 Jan 2015 14:21:16 -0500
+Subject: workqueue: fix subtle pool management issue which can stall whole worker_pool
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 29187a9eeaf362d8422e62e17a22a6e115277a49 upstream.
+
+A worker_pool's forward progress is guaranteed by the fact that the
+last idle worker assumes the manager role to create more workers and
+summon the rescuers if creating workers doesn't succeed in a timely
+manner before proceeding to execute work items.
+
+This manager role is implemented in manage_workers(), which indicates
+whether the worker may proceed to work item execution with its return
+value.  This is necessary because multiple workers may contend for the
+manager role, and, if there already is a manager, others should
+proceed to work item execution.
+
+Unfortunately, the function also indicates that the worker may proceed
+to work item execution if need_to_create_worker() is false at the head
+of the function.  need_to_create_worker() tests the following
+conditions.
+
+       pending work items && !nr_running && !nr_idle
+
+The first and third conditions are protected by pool->lock and thus
+won't change while holding pool->lock; however, nr_running can change
+asynchronously as other workers block and resume.  While it's likely
+to be zero, as someone woke this worker up in the first place, some
+other workers could have become runnable in between, making it non-zero.
+
+If this happens, manage_workers() could return false even with zero
+nr_idle, making the worker, the last idle one, proceed to execute work
+items.  If then all workers of the pool end up blocking on a resource
+which can only be released by a work item which is pending on that
+pool, the whole pool can deadlock as there's no one to create more
+workers or summon the rescuers.
+
+This patch fixes the problem by removing the early exit condition from
+maybe_create_worker() and making manage_workers() return false iff
+there's already another manager, which ensures that the last worker
+doesn't start executing work items.
+
+We can leave the early exit condition alone and just ignore the return
+value, but the only reason it was put there is that manage_workers()
+used to perform both creation and destruction of workers, and thus the
+function could be invoked while the pool was trying to reduce the
+number of workers.  Now that manage_workers() is called only when more
+workers are needed, the only cases in which this early exit condition
+triggers are rare race conditions, rendering it pointless.
+
+Tested with a simulated workload and modified workqueue code which
+trigger the pool deadlock reliably without this patch.
+
+tj: Updated to v3.14 where manage_workers() is responsible not only
+    for creating more workers but also destroying surplus ones.
+    maybe_create_worker() needs to keep its early exit condition to
+    avoid creating a new worker when manage_workers() is called to
+    destroy surplus ones.  Other than that, the adaptation is
+    straightforward.  Both maybe_{create|destroy}_worker() functions
+    are converted to return void and manage_workers() returns %false
+    iff it lost manager arbitration.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-by: Eric Sandeen <sandeen@sandeen.net>
+Link: http://lkml.kernel.org/g/54B019F4.8030009@sandeen.net
+Cc: Dave Chinner <david@fromorbit.com>
+Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ kernel/workqueue.c |   42 +++++++++++++-----------------------------
+ 1 file changed, 13 insertions(+), 29 deletions(-)
+
+--- a/kernel/workqueue.c
++++ b/kernel/workqueue.c
+@@ -1962,17 +1962,13 @@ static void pool_mayday_timeout(unsigned
+  * spin_lock_irq(pool->lock) which may be released and regrabbed
+  * multiple times.  Does GFP_KERNEL allocations.  Called only from
+  * manager.
+- *
+- * Return:
+- * %false if no action was taken and pool->lock stayed locked, %true
+- * otherwise.
+  */
+-static bool maybe_create_worker(struct worker_pool *pool)
++static void maybe_create_worker(struct worker_pool *pool)
+ __releases(&pool->lock)
+ __acquires(&pool->lock)
+ {
+       if (!need_to_create_worker(pool))
+-              return false;
++              return;
+ restart:
+       spin_unlock_irq(&pool->lock);
+@@ -1989,7 +1985,7 @@ restart:
+                       start_worker(worker);
+                       if (WARN_ON_ONCE(need_to_create_worker(pool)))
+                               goto restart;
+-                      return true;
++                      return;
+               }
+               if (!need_to_create_worker(pool))
+@@ -2006,7 +2002,7 @@ restart:
+       spin_lock_irq(&pool->lock);
+       if (need_to_create_worker(pool))
+               goto restart;
+-      return true;
++      return;
+ }
+ /**
+@@ -2019,15 +2015,9 @@ restart:
+  * LOCKING:
+  * spin_lock_irq(pool->lock) which may be released and regrabbed
+  * multiple times.  Called only from manager.
+- *
+- * Return:
+- * %false if no action was taken and pool->lock stayed locked, %true
+- * otherwise.
+  */
+-static bool maybe_destroy_workers(struct worker_pool *pool)
++static void maybe_destroy_workers(struct worker_pool *pool)
+ {
+-      bool ret = false;
+-
+       while (too_many_workers(pool)) {
+               struct worker *worker;
+               unsigned long expires;
+@@ -2041,10 +2031,7 @@ static bool maybe_destroy_workers(struct
+               }
+               destroy_worker(worker);
+-              ret = true;
+       }
+-
+-      return ret;
+ }
+ /**
+@@ -2064,16 +2051,14 @@ static bool maybe_destroy_workers(struct
+  * multiple times.  Does GFP_KERNEL allocations.
+  *
+  * Return:
+- * %false if the pool don't need management and the caller can safely start
+- * processing works, %true indicates that the function released pool->lock
+- * and reacquired it to perform some management function and that the
+- * conditions that the caller verified while holding the lock before
+- * calling the function might no longer be true.
++ * %false if the pool doesn't need management and the caller can safely
++ * start processing works, %true if management function was performed and
++ * the conditions that the caller verified before calling the function may
++ * no longer be true.
+  */
+ static bool manage_workers(struct worker *worker)
+ {
+       struct worker_pool *pool = worker->pool;
+-      bool ret = false;
+       /*
+        * Managership is governed by two mutexes - manager_arb and
+@@ -2097,7 +2082,7 @@ static bool manage_workers(struct worker
+        * manager_mutex.
+        */
+       if (!mutex_trylock(&pool->manager_arb))
+-              return ret;
++              return false;
+       /*
+        * With manager arbitration won, manager_mutex would be free in
+@@ -2107,7 +2092,6 @@ static bool manage_workers(struct worker
+               spin_unlock_irq(&pool->lock);
+               mutex_lock(&pool->manager_mutex);
+               spin_lock_irq(&pool->lock);
+-              ret = true;
+       }
+       pool->flags &= ~POOL_MANAGE_WORKERS;
+@@ -2116,12 +2100,12 @@ static bool manage_workers(struct worker
+        * Destroy and then create so that may_start_working() is true
+        * on return.
+        */
+-      ret |= maybe_destroy_workers(pool);
+-      ret |= maybe_create_worker(pool);
++      maybe_destroy_workers(pool);
++      maybe_create_worker(pool);
+       mutex_unlock(&pool->manager_mutex);
+       mutex_unlock(&pool->manager_arb);
+-      return ret;
++      return true;
+ }
+ /**
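
The deadlock described in the workqueue commit message hinges on the
need_to_create_worker() condition it quotes, "pending work items &&
!nr_running && !nr_idle".  Below is a reduced, userspace-only sketch of
that predicate; the struct and field names are illustrative, not the
kernel's worker_pool.  Of the three inputs only nr_running can change
behind the caller's back, which is why a single check at the top of
manage_workers() could let the last idle worker skip creating a
replacement.

#include <stdbool.h>
#include <stdio.h>

struct pool {
        int nr_pending;   /* pending work items, protected by pool->lock */
        int nr_idle;      /* idle workers, protected by pool->lock */
        int nr_running;   /* changes asynchronously as workers block/wake */
};

/* The condition the commit message quotes, reduced to plain fields. */
static bool need_to_create_worker(const struct pool *p)
{
        return p->nr_pending > 0 && p->nr_running == 0 && p->nr_idle == 0;
}

int main(void)
{
        struct pool p = { .nr_pending = 4, .nr_idle = 0, .nr_running = 1 };

        /* A transiently non-zero nr_running hides the need for a worker. */
        printf("need worker: %d\n", need_to_create_worker(&p));  /* 0 */

        p.nr_running = 0;
        printf("need worker: %d\n", need_to_create_worker(&p));  /* 1 */
        return 0;
}

The patch sidesteps the race by no longer letting this early answer
decide whether the last idle worker may start executing work items:
manage_workers() now returns false only when another worker already
holds the manager role.
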