From 032f48ad7455b646e75581d2e21159648a5881fe Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Tue, 3 Feb 2015 15:11:48 -0800
Subject: [PATCH] 3.14-stable patches

added patches:
	rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
	target-drop-arbitrary-maximum-i-o-size-limit.patch
	workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
---
 ...dev_parent_get-when-parent_overlap-0.patch |  94 +++++++++
 queue-3.14/series                             |   3 +
 ...rop-arbitrary-maximum-i-o-size-limit.patch | 159 ++++++++++++++
 ...ue-which-can-stall-whole-worker_pool.patch | 199 ++++++++++++++++++
 4 files changed, 455 insertions(+)
 create mode 100644 queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
 create mode 100644 queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch
 create mode 100644 queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch

diff --git a/queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch b/queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
new file mode 100644
index 00000000000..32beda58f7e
--- /dev/null
+++ b/queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
@@ -0,0 +1,94 @@
+From ae43e9d05eb4bd324155292f889fbd001c4faea8 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov
+Date: Mon, 19 Jan 2015 18:13:43 +0300
+Subject: rbd: fix rbd_dev_parent_get() when parent_overlap == 0
+
+From: Ilya Dryomov
+
+commit ae43e9d05eb4bd324155292f889fbd001c4faea8 upstream.
+
+The comment for rbd_dev_parent_get() said
+
+ * We must get the reference before checking for the overlap to
+ * coordinate properly with zeroing the parent overlap in
+ * rbd_dev_v2_parent_info() when an image gets flattened.  We
+ * drop it again if there is no overlap.
+
+but the "drop it again if there is no overlap" part was missing from
+the implementation.  This led to absurd parent_ref values for images
+with parent_overlap == 0, as parent_ref was incremented for each
+img_request and virtually never decremented.
+
+Fix this by leveraging the fact that the refresh path calls
+rbd_dev_v2_parent_info() under header_rwsem and using it for read in
+rbd_dev_parent_get(), instead of messing around with atomics.  Get rid
+of barriers in rbd_dev_v2_parent_info() while at it - I don't see what
+they'd pair with now and I suspect we are in a pretty miserable
+situation as far as proper locking goes regardless.
+
+Signed-off-by: Ilya Dryomov
+Reviewed-by: Josh Durgin
+Reviewed-by: Alex Elder
+[idryomov@redhat.com: backport to 3.14: context]
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/block/rbd.c | 20 ++++++--------------
+ 1 file changed, 6 insertions(+), 14 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -1926,32 +1926,26 @@ static void rbd_dev_parent_put(struct rb
+  * If an image has a non-zero parent overlap, get a reference to its
+  * parent.
+  *
+- * We must get the reference before checking for the overlap to
+- * coordinate properly with zeroing the parent overlap in
+- * rbd_dev_v2_parent_info() when an image gets flattened.  We
+- * drop it again if there is no overlap.
+- *
+  * Returns true if the rbd device has a parent with a non-zero
+  * overlap and a reference for it was successfully taken, or
+  * false otherwise.
+  */
+ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
+ {
+-	int counter;
++	int counter = 0;
+
+ 	if (!rbd_dev->parent_spec)
+ 		return false;
+
+-	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
+-	if (counter > 0 && rbd_dev->parent_overlap)
+-		return true;
+-
+-	/* Image was flattened, but parent is not yet torn down */
++	down_read(&rbd_dev->header_rwsem);
++	if (rbd_dev->parent_overlap)
++		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
++	up_read(&rbd_dev->header_rwsem);
+
+ 	if (counter < 0)
+ 		rbd_warn(rbd_dev, "parent reference overflow\n");
+
+-	return false;
++	return counter > 0;
+ }
+
+ /*
+@@ -3904,7 +3898,6 @@ static int rbd_dev_v2_parent_info(struct
+ 	 */
+ 	if (rbd_dev->parent_overlap) {
+ 		rbd_dev->parent_overlap = 0;
+-		smp_mb();
+ 		rbd_dev_parent_put(rbd_dev);
+ 		pr_info("%s: clone image has been flattened\n",
+ 			rbd_dev->disk->disk_name);
+@@ -3948,7 +3941,6 @@ static int rbd_dev_v2_parent_info(struct
+ 	 * treat it specially.
+ 	 */
+ 	rbd_dev->parent_overlap = overlap;
+-	smp_mb();
+ 	if (!overlap) {
+
+ 		/* A null parent_spec indicates it's the initial probe */
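
The essence of the fix above is an ordering change: rbd_dev_parent_get() now samples
parent_overlap and takes the parent_ref while holding header_rwsem for read, and the
refresh/flatten path zeroes the overlap and drops its reference under the same rwsem,
so a reference can no longer be taken for an already-flattened image and then never
dropped. A minimal userspace C sketch of that locking pattern, with a pthread rwlock
standing in for header_rwsem and purely illustrative names (parent_get, parent_put and
flatten are not the kernel functions):

    /*
     * Sketch of the pattern the fix relies on: the reader bumps the refcount
     * only while it can see a non-zero overlap under the read lock, and the
     * flatten path zeroes the overlap and drops the base ref under the write
     * lock.  Illustration only, not drivers/block/rbd.c.
     */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_rwlock_t header_lock = PTHREAD_RWLOCK_INITIALIZER;
    static unsigned long long parent_overlap = 1ULL << 30;  /* bytes */
    static atomic_int parent_ref = 1;                       /* initial ref */

    /* Reader side: take a reference only while the overlap is visible. */
    static bool parent_get(void)
    {
            int counter = 0;

            pthread_rwlock_rdlock(&header_lock);
            if (parent_overlap)
                    counter = atomic_fetch_add(&parent_ref, 1) + 1;
            pthread_rwlock_unlock(&header_lock);

            return counter > 0;
    }

    static void parent_put(void)
    {
            atomic_fetch_sub(&parent_ref, 1);
    }

    /* Flatten side: zero the overlap and drop the base ref under the write lock. */
    static void flatten(void)
    {
            pthread_rwlock_wrlock(&header_lock);
            if (parent_overlap) {
                    parent_overlap = 0;
                    parent_put();
            }
            pthread_rwlock_unlock(&header_lock);
    }

    int main(void)
    {
            if (parent_get()) {     /* got a ref: overlap was non-zero */
                    /* ... issue I/O against the parent ... */
                    parent_put();
            }
            flatten();
            printf("refs after flatten: %d\n", atomic_load(&parent_ref)); /* 0 */
            return 0;
    }

Built with cc -pthread, the reference count settles back at zero after flatten()
instead of growing with every request, which is the failure mode described above.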
diff --git a/queue-3.14/series b/queue-3.14/series
index 01394ccd467..55802529148 100644
--- a/queue-3.14/series
+++ b/queue-3.14/series
@@ -28,3 +28,6 @@ arm-dma-ensure-that-old-section-mappings-are-flushed-from-the-tlb.patch
 pstore-clarify-clearing-of-_read_cnt-in-ramoops_context.patch
 pstore-skip-zero-size-persistent-ram-buffer-in-traverse.patch
 pstore-fix-null-pointer-fault-if-get-null-prz-in-ramoops_get_next_prz.patch
+rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
+workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
+target-drop-arbitrary-maximum-i-o-size-limit.patch
diff --git a/queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch b/queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch
new file mode 100644
index 00000000000..badff3bcd13
--- /dev/null
+++ b/queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch
@@ -0,0 +1,159 @@
+From 046ba64285a4389ae5e9a7dfa253c6bff3d7c341 Mon Sep 17 00:00:00 2001
+From: Nicholas Bellinger
+Date: Tue, 6 Jan 2015 16:10:37 -0800
+Subject: target: Drop arbitrary maximum I/O size limit
+
+From: Nicholas Bellinger
+
+commit 046ba64285a4389ae5e9a7dfa253c6bff3d7c341 upstream.
+
+This patch drops the arbitrary maximum I/O size limit in sbc_parse_cdb(),
+which currently for fabric_max_sectors is hardcoded to 8192 (4 MB for 512
+byte sector devices), and for hw_max_sectors is a backend driver dependent
+value.
+
+This limit is problematic because Linux initiators have only recently
+started to honor block limits MAXIMUM TRANSFER LENGTH, and other non-Linux
+based initiators (eg: MSFT Fibre Channel) can also generate I/Os larger
+than 4 MB in size.
+
+Currently when this happens, the following message will appear on the
+target resulting in I/Os being returned with non recoverable status:
+
+  SCSI OP 28h with too big sectors 16384 exceeds fabric_max_sectors: 8192
+
+Instead, drop both [fabric,hw]_max_sector checks in sbc_parse_cdb(),
+and convert the existing hw_max_sectors into a purely informational
+attribute used to represent the granularity that backend driver and/or
+subsystem code is splitting I/Os upon.
+
+Also, update FILEIO with an explicit FD_MAX_BYTES check in fd_execute_rw()
+to deal with the one special iovec limitation case.
+
+v2 changes:
+  - Drop hw_max_sectors check in sbc_parse_cdb()
+
+Reported-by: Lance Gropper
+Reported-by: Stefan Priebe
+Cc: Christoph Hellwig
+Cc: Martin K. Petersen
+Cc: Roland Dreier
+Signed-off-by: Nicholas Bellinger
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/target/target_core_device.c |  8 ++++----
+ drivers/target/target_core_file.c   | 11 ++++++++++-
+ drivers/target/target_core_iblock.c |  2 +-
+ drivers/target/target_core_sbc.c    | 15 ---------------
+ drivers/target/target_core_spc.c    |  5 +----
+ 5 files changed, 16 insertions(+), 25 deletions(-)
+
+--- a/drivers/target/target_core_device.c
++++ b/drivers/target/target_core_device.c
+@@ -1153,10 +1153,10 @@ int se_dev_set_optimal_sectors(struct se
+ 			" changed for TCM/pSCSI\n", dev);
+ 		return -EINVAL;
+ 	}
+-	if (optimal_sectors > dev->dev_attrib.fabric_max_sectors) {
++	if (optimal_sectors > dev->dev_attrib.hw_max_sectors) {
+ 		pr_err("dev[%p]: Passed optimal_sectors %u cannot be"
+-			" greater than fabric_max_sectors: %u\n", dev,
+-			optimal_sectors, dev->dev_attrib.fabric_max_sectors);
++			" greater than hw_max_sectors: %u\n", dev,
++			optimal_sectors, dev->dev_attrib.hw_max_sectors);
+ 		return -EINVAL;
+ 	}
+
+@@ -1565,7 +1565,6 @@ struct se_device *target_alloc_device(st
+ 					DA_UNMAP_GRANULARITY_ALIGNMENT_DEFAULT;
+ 	dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
+ 	dev->dev_attrib.fabric_max_sectors = DA_FABRIC_MAX_SECTORS;
+-	dev->dev_attrib.optimal_sectors = DA_FABRIC_MAX_SECTORS;
+
+ 	xcopy_lun = &dev->xcopy_lun;
+ 	xcopy_lun->lun_se_dev = dev;
+@@ -1606,6 +1605,7 @@ int target_configure_device(struct se_de
+ 	dev->dev_attrib.hw_max_sectors =
+ 		se_dev_align_max_sectors(dev->dev_attrib.hw_max_sectors,
+ 					 dev->dev_attrib.hw_block_size);
++	dev->dev_attrib.optimal_sectors = dev->dev_attrib.hw_max_sectors;
+
+ 	dev->dev_index = scsi_get_new_index(SCSI_DEVICE_INDEX);
+ 	dev->creation_time = get_jiffies_64();
+--- a/drivers/target/target_core_file.c
++++ b/drivers/target/target_core_file.c
+@@ -620,7 +620,16 @@ fd_execute_rw(struct se_cmd *cmd, struct
+ 	struct fd_prot fd_prot;
+ 	sense_reason_t rc;
+ 	int ret = 0;
+-
++	/*
++	 * We are currently limited by the number of iovecs (2048) per
++	 * single vfs_[writev,readv] call.
++	 */
++	if (cmd->data_length > FD_MAX_BYTES) {
++		pr_err("FILEIO: Not able to process I/O of %u bytes due to"
++			"FD_MAX_BYTES: %u iovec count limitiation\n",
++			cmd->data_length, FD_MAX_BYTES);
++		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
++	}
+ 	/*
+ 	 * Call vectorized fileio functions to map struct scatterlist
+ 	 * physical memory addresses to struct iovec virtual memory.
+--- a/drivers/target/target_core_iblock.c
++++ b/drivers/target/target_core_iblock.c
+@@ -123,7 +123,7 @@ static int iblock_configure_device(struc
+ 	q = bdev_get_queue(bd);
+
+ 	dev->dev_attrib.hw_block_size = bdev_logical_block_size(bd);
+-	dev->dev_attrib.hw_max_sectors = UINT_MAX;
++	dev->dev_attrib.hw_max_sectors = queue_max_hw_sectors(q);
+ 	dev->dev_attrib.hw_queue_depth = q->nr_requests;
+
+ 	/*
+--- a/drivers/target/target_core_sbc.c
++++ b/drivers/target/target_core_sbc.c
+@@ -910,21 +910,6 @@ sbc_parse_cdb(struct se_cmd *cmd, struct
+ 	if (cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) {
+ 		unsigned long long end_lba;
+
+-		if (sectors > dev->dev_attrib.fabric_max_sectors) {
+-			printk_ratelimited(KERN_ERR "SCSI OP %02xh with too"
+-				" big sectors %u exceeds fabric_max_sectors:"
+-				" %u\n", cdb[0], sectors,
+-				dev->dev_attrib.fabric_max_sectors);
+-			return TCM_INVALID_CDB_FIELD;
+-		}
+-		if (sectors > dev->dev_attrib.hw_max_sectors) {
+-			printk_ratelimited(KERN_ERR "SCSI OP %02xh with too"
+-				" big sectors %u exceeds backend hw_max_sectors:"
+-				" %u\n", cdb[0], sectors,
+-				dev->dev_attrib.hw_max_sectors);
+-			return TCM_INVALID_CDB_FIELD;
+-		}
+-
+ 		end_lba = dev->transport->get_blocks(dev) + 1;
+ 		if (cmd->t_task_lba + sectors > end_lba) {
+ 			pr_err("cmd exceeds last lba %llu "
+--- a/drivers/target/target_core_spc.c
++++ b/drivers/target/target_core_spc.c
+@@ -503,7 +503,6 @@ static sense_reason_t
+ spc_emulate_evpd_b0(struct se_cmd *cmd, unsigned char *buf)
+ {
+ 	struct se_device *dev = cmd->se_dev;
+-	u32 max_sectors;
+ 	int have_tp = 0;
+ 	int opt, min;
+
+@@ -537,9 +536,7 @@ spc_emulate_evpd_b0(struct se_cmd *cmd,
+ 	/*
+ 	 * Set MAXIMUM TRANSFER LENGTH
+ 	 */
+-	max_sectors = min(dev->dev_attrib.fabric_max_sectors,
+-			  dev->dev_attrib.hw_max_sectors);
+-	put_unaligned_be32(max_sectors, &buf[8]);
++	put_unaligned_be32(dev->dev_attrib.hw_max_sectors, &buf[8]);
+
+ 	/*
+ 	 * Set OPTIMAL TRANSFER LENGTH
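
With the sbc_parse_cdb() checks gone, the backend limit survives only as information:
iblock now reports queue_max_hw_sectors() as hw_max_sectors, and spc_emulate_evpd_b0()
advertises it as MAXIMUM TRANSFER LENGTH at byte offset 8 of the Block Limits VPD page
(0xB0), which well-behaved initiators read to size their I/Os. A small standalone C
sketch of that encoding, assuming SBC-3 offsets and using local put_be32/get_be32
helpers rather than the kernel's put_unaligned_be32():

    /*
     * Sketch of how MAXIMUM TRANSFER LENGTH lands in the Block Limits VPD
     * page after this change.  Standalone illustration, not the kernel's
     * spc_emulate_evpd_b0() itself.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void put_be32(uint32_t val, uint8_t *p)
    {
            p[0] = val >> 24;
            p[1] = val >> 16;
            p[2] = val >> 8;
            p[3] = val;
    }

    static uint32_t get_be32(const uint8_t *p)
    {
            return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
                   ((uint32_t)p[2] << 8) | p[3];
    }

    int main(void)
    {
            uint8_t vpd_b0[64];
            uint32_t hw_max_sectors = 2048; /* e.g. value reported by the backend */

            memset(vpd_b0, 0, sizeof(vpd_b0));
            vpd_b0[1] = 0xb0;                       /* Block Limits page code */
            put_be32(hw_max_sectors, &vpd_b0[8]);   /* MAXIMUM TRANSFER LENGTH */

            /* Initiator side: cap each I/O at this many logical blocks. */
            printf("max transfer length: %u blocks\n", get_be32(&vpd_b0[8]));
            return 0;
    }

In SBC-3 the field counts logical blocks of the addressed device, so the page has to
be built per backend device rather than from a fabric-wide constant.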
diff --git a/queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch b/queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
new file mode 100644
index 00000000000..bdd72dce5db
--- /dev/null
+++ b/queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
@@ -0,0 +1,199 @@
+From 29187a9eeaf362d8422e62e17a22a6e115277a49 Mon Sep 17 00:00:00 2001
+From: Tejun Heo
+Date: Fri, 16 Jan 2015 14:21:16 -0500
+Subject: workqueue: fix subtle pool management issue which can stall whole worker_pool
+
+From: Tejun Heo
+
+commit 29187a9eeaf362d8422e62e17a22a6e115277a49 upstream.
+
+A worker_pool's forward progress is guaranteed by the fact that the
+last idle worker assumes the manager role to create more workers and
+summon the rescuers if creating workers doesn't succeed in a timely
+manner before proceeding to execute work items.
+
+This manager role is implemented in manage_workers(), which indicates
+whether the worker may proceed to work item execution with its return
+value.  This is necessary because multiple workers may contend for the
+manager role, and, if there already is a manager, others should
+proceed to work item execution.
+
+Unfortunately, the function also indicates that the worker may proceed
+to work item execution if need_to_create_worker() is false at the head
+of the function.  need_to_create_worker() tests the following
+conditions.
+
+	pending work items && !nr_running && !nr_idle
+
+The first and third conditions are protected by pool->lock and thus
+won't change while holding pool->lock; however, nr_running can change
+asynchronously as other workers block and resume and, while it's likely
+to be zero, as someone woke this worker up in the first place, some
+other workers could have become runnable in between, making it non-zero.
+
+If this happens, manage_workers() could return false even with zero
+nr_idle, making the worker, the last idle one, proceed to execute work
+items.  If then all workers of the pool end up blocking on a resource
+which can only be released by a work item which is pending on that
+pool, the whole pool can deadlock as there's no one to create more
+workers or summon the rescuers.
+
+This patch fixes the problem by removing the early exit condition from
+maybe_create_worker() and making manage_workers() return false iff
+there's already another manager, which ensures that the last worker
+doesn't start executing work items.
+
+We can leave the early exit condition alone and just ignore the return
+value, but the only reason it was put there is because
+manage_workers() used to perform both creations and destructions of
+workers and thus the function may be invoked while the pool is trying
+to reduce the number of workers.  Now that manage_workers() is called
+only when more workers are needed, the only case this early exit
+condition is triggered is rare race conditions, rendering it pointless.
+
+Tested with a simulated workload and modified workqueue code which
+trigger the pool deadlock reliably without this patch.
+
+tj: Updated to v3.14 where manage_workers() is responsible not only
+    for creating more workers but also destroying surplus ones.
+    maybe_create_worker() needs to keep its early exit condition to
+    avoid creating a new worker when manage_workers() is called to
+    destroy surplus ones.  Other than that, the adaptation is
+    straightforward.  Both maybe_{create|destroy}_worker() functions
+    are converted to return void and manage_workers() returns %false
+    iff it lost manager arbitration.
+
+Signed-off-by: Tejun Heo
+Reported-by: Eric Sandeen
+Link: http://lkml.kernel.org/g/54B019F4.8030009@sandeen.net
+Cc: Dave Chinner
+Cc: Lai Jiangshan
+Signed-off-by: Greg Kroah-Hartman
+
+
+---
+ kernel/workqueue.c | 42 +++++++++++++-----------------------------
+ 1 file changed, 13 insertions(+), 29 deletions(-)
+
+--- a/kernel/workqueue.c
++++ b/kernel/workqueue.c
+@@ -1962,17 +1962,13 @@ static void pool_mayday_timeout(unsigned
+  * spin_lock_irq(pool->lock) which may be released and regrabbed
+  * multiple times.  Does GFP_KERNEL allocations.  Called only from
+  * manager.
+- *
+- * Return:
+- * %false if no action was taken and pool->lock stayed locked, %true
+- * otherwise.
+  */
+-static bool maybe_create_worker(struct worker_pool *pool)
++static void maybe_create_worker(struct worker_pool *pool)
+ __releases(&pool->lock)
+ __acquires(&pool->lock)
+ {
+ 	if (!need_to_create_worker(pool))
+-		return false;
++		return;
+ restart:
+ 	spin_unlock_irq(&pool->lock);
+
+@@ -1989,7 +1985,7 @@ restart:
+ 			start_worker(worker);
+ 			if (WARN_ON_ONCE(need_to_create_worker(pool)))
+ 				goto restart;
+-			return true;
++			return;
+ 		}
+
+ 		if (!need_to_create_worker(pool))
+@@ -2006,7 +2002,7 @@ restart:
+ 	spin_lock_irq(&pool->lock);
+ 	if (need_to_create_worker(pool))
+ 		goto restart;
+-	return true;
++	return;
+ }
+
+ /**
+@@ -2019,15 +2015,9 @@ restart:
+  * LOCKING:
+  * spin_lock_irq(pool->lock) which may be released and regrabbed
+  * multiple times.  Called only from manager.
+- *
+- * Return:
+- * %false if no action was taken and pool->lock stayed locked, %true
+- * otherwise.
+  */
+-static bool maybe_destroy_workers(struct worker_pool *pool)
++static void maybe_destroy_workers(struct worker_pool *pool)
+ {
+-	bool ret = false;
+-
+ 	while (too_many_workers(pool)) {
+ 		struct worker *worker;
+ 		unsigned long expires;
+@@ -2041,10 +2031,7 @@ static bool maybe_destroy_workers(struct
+ 		}
+
+ 		destroy_worker(worker);
+-		ret = true;
+ 	}
+-
+-	return ret;
+ }
+
+ /**
+@@ -2064,16 +2051,14 @@ static bool maybe_destroy_workers(struct
+  * multiple times.  Does GFP_KERNEL allocations.
+  *
+  * Return:
+- * %false if the pool don't need management and the caller can safely start
+- * processing works, %true indicates that the function released pool->lock
+- * and reacquired it to perform some management function and that the
+- * conditions that the caller verified while holding the lock before
+- * calling the function might no longer be true.
++ * %false if the pool doesn't need management and the caller can safely
++ * start processing works, %true if management function was performed and
++ * the conditions that the caller verified before calling the function may
++ * no longer be true.
+  */
+ static bool manage_workers(struct worker *worker)
+ {
+ 	struct worker_pool *pool = worker->pool;
+-	bool ret = false;
+
+ 	/*
+ 	 * Managership is governed by two mutexes - manager_arb and
+@@ -2097,7 +2082,7 @@ static bool manage_workers(struct worker
+ 	 * manager_mutex.
+ 	 */
+ 	if (!mutex_trylock(&pool->manager_arb))
+-		return ret;
++		return false;
+
+ 	/*
+ 	 * With manager arbitration won, manager_mutex would be free in
+@@ -2107,7 +2092,6 @@ static bool manage_workers(struct worker
+ 		spin_unlock_irq(&pool->lock);
+ 		mutex_lock(&pool->manager_mutex);
+ 		spin_lock_irq(&pool->lock);
+-		ret = true;
+ 	}
+
+ 	pool->flags &= ~POOL_MANAGE_WORKERS;
+@@ -2116,12 +2100,12 @@ static bool manage_workers(struct worker
+ 	 * Destroy and then create so that may_start_working() is true
+ 	 * on return.
+ 	 */
+-	ret |= maybe_destroy_workers(pool);
+-	ret |= maybe_create_worker(pool);
++	maybe_destroy_workers(pool);
++	maybe_create_worker(pool);
+
+ 	mutex_unlock(&pool->manager_mutex);
+ 	mutex_unlock(&pool->manager_arb);
+-	return ret;
++	return true;
+ }
+
+ /**
-- 
2.47.3
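
The contract that matters to the caller after this backport is the return value of
manage_workers(): %false only when the worker lost the manager_arb arbitration, so
another manager already exists and it is safe to execute work items, and %true
whenever management was actually performed, forcing the caller to re-check the pool
state. A compact userspace C analogue of that contract, with a pthread mutex standing
in for manager_arb and the maybe_* helpers reduced to stubs (names are illustrative,
not kernel/workqueue.c):

    /*
     * Userspace sketch of the manager-arbitration rule: return false only
     * when another thread already holds the manager role; whenever we win
     * arbitration, perform management and return true so the caller
     * re-checks conditions before executing work items.
     */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t manager_arb = PTHREAD_MUTEX_INITIALIZER;

    static void maybe_destroy_workers(void) { /* trim surplus idle workers */ }
    static void maybe_create_worker(void)   { /* create a worker if none can run */ }

    static bool manage_workers(void)
    {
            /* Lost arbitration: someone else is (or just was) the manager. */
            if (pthread_mutex_trylock(&manager_arb) != 0)
                    return false;

            maybe_destroy_workers();
            maybe_create_worker();

            pthread_mutex_unlock(&manager_arb);
            return true;
    }

    int main(void)
    {
            /* Caller pattern: re-check the pool instead of executing work. */
            if (manage_workers())
                    printf("managed the pool; conditions may have changed, re-check\n");
            else
                    printf("another manager exists; safe to process work items\n");
            return 0;
    }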