From 032f48ad7455b646e75581d2e21159648a5881fe Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Tue, 3 Feb 2015 15:11:48 -0800
Subject: [PATCH] 3.14-stable patches

added patches:
	rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
	target-drop-arbitrary-maximum-i-o-size-limit.patch
	workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
---
 ...dev_parent_get-when-parent_overlap-0.patch |  94 +++++++++
 queue-3.14/series                             |   3 +
 ...rop-arbitrary-maximum-i-o-size-limit.patch | 159 ++++++++++++++
 ...ue-which-can-stall-whole-worker_pool.patch | 199 ++++++++++++++++++
 4 files changed, 455 insertions(+)
 create mode 100644 queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
 create mode 100644 queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch
 create mode 100644 queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch

diff --git a/queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch b/queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
new file mode 100644
index 00000000000..32beda58f7e
--- /dev/null
+++ b/queue-3.14/rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
@@ -0,0 +1,94 @@
+From ae43e9d05eb4bd324155292f889fbd001c4faea8 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov
+Date: Mon, 19 Jan 2015 18:13:43 +0300
+Subject: rbd: fix rbd_dev_parent_get() when parent_overlap == 0
+
+From: Ilya Dryomov
+
+commit ae43e9d05eb4bd324155292f889fbd001c4faea8 upstream.
+
+The comment for rbd_dev_parent_get() said
+
+ * We must get the reference before checking for the overlap to
+ * coordinate properly with zeroing the parent overlap in
+ * rbd_dev_v2_parent_info() when an image gets flattened.  We
+ * drop it again if there is no overlap.
+
+but the "drop it again if there is no overlap" part was missing from
+the implementation.  This led to absurd parent_ref values for images
+with parent_overlap == 0, as parent_ref was incremented for each
+img_request and virtually never decremented.
+
+Fix this by leveraging the fact that the refresh path calls
+rbd_dev_v2_parent_info() under header_rwsem and using it for read in
+rbd_dev_parent_get(), instead of messing around with atomics.  Get rid
+of barriers in rbd_dev_v2_parent_info() while at it - I don't see what
+they'd pair with now and I suspect we are in a pretty miserable
+situation as far as proper locking goes regardless.
+
+Signed-off-by: Ilya Dryomov
+Reviewed-by: Josh Durgin
+Reviewed-by: Alex Elder
+[idryomov@redhat.com: backport to 3.14: context]
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/block/rbd.c | 20 ++++++--------------
+ 1 file changed, 6 insertions(+), 14 deletions(-)
+
+--- a/drivers/block/rbd.c
++++ b/drivers/block/rbd.c
+@@ -1926,32 +1926,26 @@ static void rbd_dev_parent_put(struct rb
+  * If an image has a non-zero parent overlap, get a reference to its
+  * parent.
+  *
+- * We must get the reference before checking for the overlap to
+- * coordinate properly with zeroing the parent overlap in
+- * rbd_dev_v2_parent_info() when an image gets flattened.  We
+- * drop it again if there is no overlap.
+- *
+  * Returns true if the rbd device has a parent with a non-zero
+  * overlap and a reference for it was successfully taken, or
+  * false otherwise.
+  */
+ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
+ {
+-	int counter;
++	int counter = 0;
+
+ 	if (!rbd_dev->parent_spec)
+ 		return false;
+
+-	counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
+-	if (counter > 0 && rbd_dev->parent_overlap)
+-		return true;
+-
+-	/* Image was flattened, but parent is not yet torn down */
++	down_read(&rbd_dev->header_rwsem);
++	if (rbd_dev->parent_overlap)
++		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
++	up_read(&rbd_dev->header_rwsem);
+
+ 	if (counter < 0)
+ 		rbd_warn(rbd_dev, "parent reference overflow\n");
+
+-	return false;
++	return counter > 0;
+ }
+
+ /*
+@@ -3904,7 +3898,6 @@ static int rbd_dev_v2_parent_info(struct
+ 	 */
+ 	if (rbd_dev->parent_overlap) {
+ 		rbd_dev->parent_overlap = 0;
+-		smp_mb();
+ 		rbd_dev_parent_put(rbd_dev);
+ 		pr_info("%s: clone image has been flattened\n",
+ 			rbd_dev->disk->disk_name);
+@@ -3948,7 +3941,6 @@ static int rbd_dev_v2_parent_info(struct
+ 	 * treat it specially.
+ 	 */
+ 	rbd_dev->parent_overlap = overlap;
+-	smp_mb();
+ 	if (!overlap) {
+
+ 		/* A null parent_spec indicates it's the initial probe */
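
The essence of the fix above is an ordering change: rbd_dev_parent_get() now samples
parent_overlap and takes the parent_ref while holding header_rwsem for read, and the
refresh/flatten path zeroes the overlap and drops its reference under the same rwsem,
so a reference can no longer be taken for an already-flattened image and then never
dropped. A minimal userspace C sketch of that locking pattern, with a pthread rwlock
standing in for header_rwsem and purely illustrative names (parent_get, parent_put and
flatten are not the kernel functions):

    /*
     * Sketch of the pattern the fix relies on: the reader bumps the refcount
     * only while it can see a non-zero overlap under the read lock, and the
     * flatten path zeroes the overlap and drops the base ref under the write
     * lock.  Illustration only, not drivers/block/rbd.c.
     */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_rwlock_t header_lock = PTHREAD_RWLOCK_INITIALIZER;
    static unsigned long long parent_overlap = 1ULL << 30;  /* bytes */
    static atomic_int parent_ref = 1;                       /* initial ref */

    /* Reader side: take a reference only while the overlap is visible. */
    static bool parent_get(void)
    {
            int counter = 0;

            pthread_rwlock_rdlock(&header_lock);
            if (parent_overlap)
                    counter = atomic_fetch_add(&parent_ref, 1) + 1;
            pthread_rwlock_unlock(&header_lock);

            return counter > 0;
    }

    static void parent_put(void)
    {
            atomic_fetch_sub(&parent_ref, 1);
    }

    /* Flatten side: zero the overlap and drop the base ref under the write lock. */
    static void flatten(void)
    {
            pthread_rwlock_wrlock(&header_lock);
            if (parent_overlap) {
                    parent_overlap = 0;
                    parent_put();
            }
            pthread_rwlock_unlock(&header_lock);
    }

    int main(void)
    {
            if (parent_get()) {     /* got a ref: overlap was non-zero */
                    /* ... issue I/O against the parent ... */
                    parent_put();
            }
            flatten();
            printf("refs after flatten: %d\n", atomic_load(&parent_ref)); /* 0 */
            return 0;
    }

Built with cc -pthread, the reference count settles back at zero after flatten()
instead of growing with every request, which is the failure mode described above.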
diff --git a/queue-3.14/series b/queue-3.14/series
index 01394ccd467..55802529148 100644
--- a/queue-3.14/series
+++ b/queue-3.14/series
@@ -28,3 +28,6 @@ arm-dma-ensure-that-old-section-mappings-are-flushed-from-the-tlb.patch
 pstore-clarify-clearing-of-_read_cnt-in-ramoops_context.patch
 pstore-skip-zero-size-persistent-ram-buffer-in-traverse.patch
 pstore-fix-null-pointer-fault-if-get-null-prz-in-ramoops_get_next_prz.patch
+rbd-fix-rbd_dev_parent_get-when-parent_overlap-0.patch
+workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
+target-drop-arbitrary-maximum-i-o-size-limit.patch
diff --git a/queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch b/queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch
new file mode 100644
index 00000000000..badff3bcd13
--- /dev/null
+++ b/queue-3.14/target-drop-arbitrary-maximum-i-o-size-limit.patch
@@ -0,0 +1,159 @@
+From 046ba64285a4389ae5e9a7dfa253c6bff3d7c341 Mon Sep 17 00:00:00 2001
+From: Nicholas Bellinger
+Date: Tue, 6 Jan 2015 16:10:37 -0800
+Subject: target: Drop arbitrary maximum I/O size limit
+
+From: Nicholas Bellinger
+
+commit 046ba64285a4389ae5e9a7dfa253c6bff3d7c341 upstream.
+
+This patch drops the arbitrary maximum I/O size limit in sbc_parse_cdb(),
+which currently for fabric_max_sectors is hardcoded to 8192 (4 MB for 512
+byte sector devices), and for hw_max_sectors is a backend driver dependent
+value.
+
+This limit is problematic because Linux initiators have only recently
+started to honor block limits MAXIMUM TRANSFER LENGTH, and other non-Linux
+based initiators (eg: MSFT Fibre Channel) can also generate I/Os larger
+than 4 MB in size.
+
+Currently when this happens, the following message will appear on the
+target resulting in I/Os being returned with non recoverable status:
+
+  SCSI OP 28h with too big sectors 16384 exceeds fabric_max_sectors: 8192
+
+Instead, drop both [fabric,hw]_max_sector checks in sbc_parse_cdb(),
+and convert the existing hw_max_sectors into a purely informational
+attribute used to represent the granularity that backend driver and/or
+subsystem code is splitting I/Os upon.
+
+Also, update FILEIO with an explicit FD_MAX_BYTES check in fd_execute_rw()
+to deal with the one special iovec limitation case.
+
+v2 changes:
+  - Drop hw_max_sectors check in sbc_parse_cdb()
+
+Reported-by: Lance Gropper
+Reported-by: Stefan Priebe
+Cc: Christoph Hellwig
+Cc: Martin K. Petersen
+Cc: Roland Dreier
+Signed-off-by: Nicholas Bellinger
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/target/target_core_device.c |  8 ++++----
+ drivers/target/target_core_file.c   | 11 ++++++++++-
+ drivers/target/target_core_iblock.c |  2 +-
+ drivers/target/target_core_sbc.c    | 15 ---------------
+ drivers/target/target_core_spc.c    |  5 +----
+ 5 files changed, 16 insertions(+), 25 deletions(-)
+
+--- a/drivers/target/target_core_device.c
++++ b/drivers/target/target_core_device.c
+@@ -1153,10 +1153,10 @@ int se_dev_set_optimal_sectors(struct se
+ 			" changed for TCM/pSCSI\n", dev);
+ 		return -EINVAL;
+ 	}
+-	if (optimal_sectors > dev->dev_attrib.fabric_max_sectors) {
++	if (optimal_sectors > dev->dev_attrib.hw_max_sectors) {
+ 		pr_err("dev[%p]: Passed optimal_sectors %u cannot be"
+-			" greater than fabric_max_sectors: %u\n", dev,
+-			optimal_sectors, dev->dev_attrib.fabric_max_sectors);
++			" greater than hw_max_sectors: %u\n", dev,
++			optimal_sectors, dev->dev_attrib.hw_max_sectors);
+ 		return -EINVAL;
+ 	}
+
+@@ -1565,7 +1565,6 @@ struct se_device *target_alloc_device(st
+ 					DA_UNMAP_GRANULARITY_ALIGNMENT_DEFAULT;
+ 	dev->dev_attrib.max_write_same_len = DA_MAX_WRITE_SAME_LEN;
+ 	dev->dev_attrib.fabric_max_sectors = DA_FABRIC_MAX_SECTORS;
+-	dev->dev_attrib.optimal_sectors = DA_FABRIC_MAX_SECTORS;
+
+ 	xcopy_lun = &dev->xcopy_lun;
+ 	xcopy_lun->lun_se_dev = dev;
+@@ -1606,6 +1605,7 @@ int target_configure_device(struct se_de
+ 	dev->dev_attrib.hw_max_sectors =
+ 		se_dev_align_max_sectors(dev->dev_attrib.hw_max_sectors,
+ 					 dev->dev_attrib.hw_block_size);
++	dev->dev_attrib.optimal_sectors = dev->dev_attrib.hw_max_sectors;
+
+ 	dev->dev_index = scsi_get_new_index(SCSI_DEVICE_INDEX);
+ 	dev->creation_time = get_jiffies_64();
+--- a/drivers/target/target_core_file.c
++++ b/drivers/target/target_core_file.c
+@@ -620,7 +620,16 @@ fd_execute_rw(struct se_cmd *cmd, struct
+ 	struct fd_prot fd_prot;
+ 	sense_reason_t rc;
+ 	int ret = 0;
+-
++	/*
++	 * We are currently limited by the number of iovecs (2048) per
++	 * single vfs_[writev,readv] call.
++	 */
++	if (cmd->data_length > FD_MAX_BYTES) {
++		pr_err("FILEIO: Not able to process I/O of %u bytes due to"
++			"FD_MAX_BYTES: %u iovec count limitiation\n",
++			cmd->data_length, FD_MAX_BYTES);
++		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
++	}
+ 	/*
+ 	 * Call vectorized fileio functions to map struct scatterlist
+ 	 * physical memory addresses to struct iovec virtual memory.
+--- a/drivers/target/target_core_iblock.c
++++ b/drivers/target/target_core_iblock.c
+@@ -123,7 +123,7 @@ static int iblock_configure_device(struc
+ 	q = bdev_get_queue(bd);
+
+ 	dev->dev_attrib.hw_block_size = bdev_logical_block_size(bd);
+-	dev->dev_attrib.hw_max_sectors = UINT_MAX;
++	dev->dev_attrib.hw_max_sectors = queue_max_hw_sectors(q);
+ 	dev->dev_attrib.hw_queue_depth = q->nr_requests;
+
+ 	/*
+--- a/drivers/target/target_core_sbc.c
++++ b/drivers/target/target_core_sbc.c
+@@ -910,21 +910,6 @@ sbc_parse_cdb(struct se_cmd *cmd, struct
+ 	if (cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) {
+ 		unsigned long long end_lba;
+
+-		if (sectors > dev->dev_attrib.fabric_max_sectors) {
+-			printk_ratelimited(KERN_ERR "SCSI OP %02xh with too"
+-				" big sectors %u exceeds fabric_max_sectors:"
+-				" %u\n", cdb[0], sectors,
+-				dev->dev_attrib.fabric_max_sectors);
+-			return TCM_INVALID_CDB_FIELD;
+-		}
+-		if (sectors > dev->dev_attrib.hw_max_sectors) {
+-			printk_ratelimited(KERN_ERR "SCSI OP %02xh with too"
+-				" big sectors %u exceeds backend hw_max_sectors:"
+-				" %u\n", cdb[0], sectors,
+-				dev->dev_attrib.hw_max_sectors);
+-			return TCM_INVALID_CDB_FIELD;
+-		}
+-
+ 		end_lba = dev->transport->get_blocks(dev) + 1;
+ 		if (cmd->t_task_lba + sectors > end_lba) {
+ 			pr_err("cmd exceeds last lba %llu "
+--- a/drivers/target/target_core_spc.c
++++ b/drivers/target/target_core_spc.c
+@@ -503,7 +503,6 @@ static sense_reason_t
+ spc_emulate_evpd_b0(struct se_cmd *cmd, unsigned char *buf)
+ {
+ 	struct se_device *dev = cmd->se_dev;
+-	u32 max_sectors;
+ 	int have_tp = 0;
+ 	int opt, min;
+
+@@ -537,9 +536,7 @@ spc_emulate_evpd_b0(struct se_cmd *cmd,
+ 	/*
+ 	 * Set MAXIMUM TRANSFER LENGTH
+ 	 */
+-	max_sectors = min(dev->dev_attrib.fabric_max_sectors,
+-			  dev->dev_attrib.hw_max_sectors);
+-	put_unaligned_be32(max_sectors, &buf[8]);
++	put_unaligned_be32(dev->dev_attrib.hw_max_sectors, &buf[8]);
+
+ 	/*
+ 	 * Set OPTIMAL TRANSFER LENGTH
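
With the sbc_parse_cdb() checks gone, the backend limit survives only as information:
iblock now reports queue_max_hw_sectors() as hw_max_sectors, and spc_emulate_evpd_b0()
advertises it as MAXIMUM TRANSFER LENGTH at byte offset 8 of the Block Limits VPD page
(0xB0), which well-behaved initiators read to size their I/Os. A small standalone C
sketch of that encoding, assuming SBC-3 offsets and using local put_be32/get_be32
helpers rather than the kernel's put_unaligned_be32():

    /*
     * Sketch of how MAXIMUM TRANSFER LENGTH lands in the Block Limits VPD
     * page after this change.  Standalone illustration, not the kernel's
     * spc_emulate_evpd_b0() itself.
     */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void put_be32(uint32_t val, uint8_t *p)
    {
            p[0] = val >> 24;
            p[1] = val >> 16;
            p[2] = val >> 8;
            p[3] = val;
    }

    static uint32_t get_be32(const uint8_t *p)
    {
            return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
                   ((uint32_t)p[2] << 8) | p[3];
    }

    int main(void)
    {
            uint8_t vpd_b0[64];
            uint32_t hw_max_sectors = 2048; /* e.g. value reported by the backend */

            memset(vpd_b0, 0, sizeof(vpd_b0));
            vpd_b0[1] = 0xb0;                       /* Block Limits page code */
            put_be32(hw_max_sectors, &vpd_b0[8]);   /* MAXIMUM TRANSFER LENGTH */

            /* Initiator side: cap each I/O at this many logical blocks. */
            printf("max transfer length: %u blocks\n", get_be32(&vpd_b0[8]));
            return 0;
    }

In SBC-3 the field counts logical blocks of the addressed device, so the page has to
be built per backend device rather than from a fabric-wide constant.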
diff --git a/queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch b/queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
new file mode 100644
index 00000000000..bdd72dce5db
--- /dev/null
+++ b/queue-3.14/workqueue-fix-subtle-pool-management-issue-which-can-stall-whole-worker_pool.patch
@@ -0,0 +1,199 @@
+From 29187a9eeaf362d8422e62e17a22a6e115277a49 Mon Sep 17 00:00:00 2001
+From: Tejun Heo
+Date: Fri, 16 Jan 2015 14:21:16 -0500
+Subject: workqueue: fix subtle pool management issue which can stall whole worker_pool
+
+From: Tejun Heo
+
+commit 29187a9eeaf362d8422e62e17a22a6e115277a49 upstream.
+
+A worker_pool's forward progress is guaranteed by the fact that the
+last idle worker assumes the manager role to create more workers and
+summon the rescuers if creating workers doesn't succeed in a timely
+manner before proceeding to execute work items.
+
+This manager role is implemented in manage_workers(), which indicates
+whether the worker may proceed to work item execution with its return
+value.  This is necessary because multiple workers may contend for the
+manager role, and, if there already is a manager, others should
+proceed to work item execution.
+
+Unfortunately, the function also indicates that the worker may proceed
+to work item execution if need_to_create_worker() is false at the head
+of the function.  need_to_create_worker() tests the following
+conditions.
+
+	pending work items && !nr_running && !nr_idle
+
+The first and third conditions are protected by pool->lock and thus
+won't change while holding pool->lock; however, nr_running can change
+asynchronously as other workers block and resume and, while it's likely
+to be zero, as someone woke this worker up in the first place, some
+other workers could have become runnable in between, making it non-zero.
+
+If this happens, manage_workers() could return false even with zero
+nr_idle, making the worker, the last idle one, proceed to execute work
+items.  If then all workers of the pool end up blocking on a resource
+which can only be released by a work item which is pending on that
+pool, the whole pool can deadlock as there's no one to create more
+workers or summon the rescuers.
+
+This patch fixes the problem by removing the early exit condition from
+maybe_create_worker() and making manage_workers() return false iff
+there's already another manager, which ensures that the last worker
+doesn't start executing work items.
+
+We can leave the early exit condition alone and just ignore the return
+value, but the only reason it was put there is because
+manage_workers() used to perform both creations and destructions of
+workers and thus the function may be invoked while the pool is trying
+to reduce the number of workers.  Now that manage_workers() is called
+only when more workers are needed, the only case this early exit
+condition is triggered is rare race conditions, rendering it pointless.
+
+Tested with a simulated workload and modified workqueue code which
+trigger the pool deadlock reliably without this patch.
+
+tj: Updated to v3.14 where manage_workers() is responsible not only
+    for creating more workers but also destroying surplus ones.
+    maybe_create_worker() needs to keep its early exit condition to
+    avoid creating a new worker when manage_workers() is called to
+    destroy surplus ones.  Other than that, the adaptation is
+    straightforward.  Both maybe_{create|destroy}_worker() functions
+    are converted to return void and manage_workers() returns %false
+    iff it lost manager arbitration.
+
+Signed-off-by: Tejun Heo
+Reported-by: Eric Sandeen
+Link: http://lkml.kernel.org/g/54B019F4.8030009@sandeen.net
+Cc: Dave Chinner
+Cc: Lai Jiangshan
+Signed-off-by: Greg Kroah-Hartman
+
+
+---
+ kernel/workqueue.c | 42 +++++++++++++-----------------------------
+ 1 file changed, 13 insertions(+), 29 deletions(-)
+
+--- a/kernel/workqueue.c
++++ b/kernel/workqueue.c
+@@ -1962,17 +1962,13 @@ static void pool_mayday_timeout(unsigned
+  * spin_lock_irq(pool->lock) which may be released and regrabbed
+  * multiple times.  Does GFP_KERNEL allocations.  Called only from
+  * manager.
+- *
+- * Return:
+- * %false if no action was taken and pool->lock stayed locked, %true
+- * otherwise.
+  */
+-static bool maybe_create_worker(struct worker_pool *pool)
++static void maybe_create_worker(struct worker_pool *pool)
+ __releases(&pool->lock)
+ __acquires(&pool->lock)
+ {
+ 	if (!need_to_create_worker(pool))
+-		return false;
++		return;
+ restart:
+ 	spin_unlock_irq(&pool->lock);
+
+@@ -1989,7 +1985,7 @@ restart:
+ 			start_worker(worker);
+ 			if (WARN_ON_ONCE(need_to_create_worker(pool)))
+ 				goto restart;
+-			return true;
++			return;
+ 		}
+
+ 		if (!need_to_create_worker(pool))
+@@ -2006,7 +2002,7 @@ restart:
+ 	spin_lock_irq(&pool->lock);
+ 	if (need_to_create_worker(pool))
+ 		goto restart;
+-	return true;
++	return;
+ }
+
+ /**
+@@ -2019,15 +2015,9 @@ restart:
+  * LOCKING:
+  * spin_lock_irq(pool->lock) which may be released and regrabbed
+  * multiple times.  Called only from manager.
+- *
+- * Return:
+- * %false if no action was taken and pool->lock stayed locked, %true
+- * otherwise.
+  */
+-static bool maybe_destroy_workers(struct worker_pool *pool)
++static void maybe_destroy_workers(struct worker_pool *pool)
+ {
+-	bool ret = false;
+-
+ 	while (too_many_workers(pool)) {
+ 		struct worker *worker;
+ 		unsigned long expires;
+@@ -2041,10 +2031,7 @@ static bool maybe_destroy_workers(struct
+ 		}
+
+ 		destroy_worker(worker);
+-		ret = true;
+ 	}
+-
+-	return ret;
+ }
+
+ /**
+@@ -2064,16 +2051,14 @@ static bool maybe_destroy_workers(struct
+  * multiple times.  Does GFP_KERNEL allocations.
+  *
+  * Return:
+- * %false if the pool don't need management and the caller can safely start
+- * processing works, %true indicates that the function released pool->lock
+- * and reacquired it to perform some management function and that the
+- * conditions that the caller verified while holding the lock before
+- * calling the function might no longer be true.
++ * %false if the pool doesn't need management and the caller can safely
++ * start processing works, %true if management function was performed and
++ * the conditions that the caller verified before calling the function may
++ * no longer be true.
+  */
+ static bool manage_workers(struct worker *worker)
+ {
+ 	struct worker_pool *pool = worker->pool;
+-	bool ret = false;
+
+ 	/*
+ 	 * Managership is governed by two mutexes - manager_arb and
+@@ -2097,7 +2082,7 @@ static bool manage_workers(struct worker
+ 	 * manager_mutex.
+ 	 */
+ 	if (!mutex_trylock(&pool->manager_arb))
+-		return ret;
++		return false;
+
+ 	/*
+ 	 * With manager arbitration won, manager_mutex would be free in
+@@ -2107,7 +2092,6 @@ static bool manage_workers(struct worker
+ 		spin_unlock_irq(&pool->lock);
+ 		mutex_lock(&pool->manager_mutex);
+ 		spin_lock_irq(&pool->lock);
+-		ret = true;
+ 	}
+
+ 	pool->flags &= ~POOL_MANAGE_WORKERS;
+@@ -2116,12 +2100,12 @@ static bool manage_workers(struct worker
+ 	 * Destroy and then create so that may_start_working() is true
+ 	 * on return.
+ 	 */
+-	ret |= maybe_destroy_workers(pool);
+-	ret |= maybe_create_worker(pool);
++	maybe_destroy_workers(pool);
++	maybe_create_worker(pool);
+
+ 	mutex_unlock(&pool->manager_mutex);
+ 	mutex_unlock(&pool->manager_arb);
+-	return ret;
++	return true;
+ }
+
+ /**
-- 
2.47.3
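
The contract that matters to the caller after this backport is the return value of
manage_workers(): %false only when the worker lost the manager_arb arbitration, so
another manager already exists and it is safe to execute work items, and %true
whenever management was actually performed, forcing the caller to re-check the pool
state. A compact userspace C analogue of that contract, with a pthread mutex standing
in for manager_arb and the maybe_* helpers reduced to stubs (names are illustrative,
not kernel/workqueue.c):

    /*
     * Userspace sketch of the manager-arbitration rule: return false only
     * when another thread already holds the manager role; whenever we win
     * arbitration, perform management and return true so the caller
     * re-checks conditions before executing work items.
     */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t manager_arb = PTHREAD_MUTEX_INITIALIZER;

    static void maybe_destroy_workers(void) { /* trim surplus idle workers */ }
    static void maybe_create_worker(void)   { /* create a worker if none can run */ }

    static bool manage_workers(void)
    {
            /* Lost arbitration: someone else is (or just was) the manager. */
            if (pthread_mutex_trylock(&manager_arb) != 0)
                    return false;

            maybe_destroy_workers();
            maybe_create_worker();

            pthread_mutex_unlock(&manager_arb);
            return true;
    }

    int main(void)
    {
            /* Caller pattern: re-check the pool instead of executing work. */
            if (manage_workers())
                    printf("managed the pool; conditions may have changed, re-check\n");
            else
                    printf("another manager exists; safe to process work items\n");
            return 0;
    }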