From: Greg Kroah-Hartman
Date: Sun, 7 Jun 2015 00:00:59 +0000 (-0700)
Subject: 4.0-stable patches
X-Git-Tag: v3.10.81~20
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d856161315896b9d6032ca6f09b6c5b6fd893796;p=thirdparty%2Fkernel%2Fstable-queue.git

4.0-stable patches

added patches:
	aio-fix-serial-draining-in-exit_aio.patch
	crush-ensuring-at-most-num-rep-osds-are-selected.patch
---

diff --git a/queue-4.0/aio-fix-serial-draining-in-exit_aio.patch b/queue-4.0/aio-fix-serial-draining-in-exit_aio.patch
new file mode 100644
index 00000000000..2df4bd1d1c3
--- /dev/null
+++ b/queue-4.0/aio-fix-serial-draining-in-exit_aio.patch
@@ -0,0 +1,152 @@
+From dc48e56d761610da4ea1088d1bea0a030b8e3e43 Mon Sep 17 00:00:00 2001
+From: Jens Axboe
+Date: Wed, 15 Apr 2015 11:17:23 -0600
+Subject: aio: fix serial draining in exit_aio()
+
+From: Jens Axboe
+
+commit dc48e56d761610da4ea1088d1bea0a030b8e3e43 upstream.
+
+exit_aio() currently serializes killing io contexts. Each context
+killing ends up having to do percpu_ref_kill(), which in turn has
+to wait for an RCU grace period. This can take a long time, depending
+on the number of contexts. And there's no point in doing them serially,
+when we could be waiting for all of them in one fell swoop.
+
+This patch makes my fio thread offload test case exit in 0.2s instead
+of almost 6s.
+
+Reviewed-by: Jeff Moyer
+Signed-off-by: Jens Axboe
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/aio.c | 45 ++++++++++++++++++++++++++++++---------------
+ 1 file changed, 30 insertions(+), 15 deletions(-)
+
+--- a/fs/aio.c
++++ b/fs/aio.c
+@@ -77,6 +77,11 @@ struct kioctx_cpu {
+ unsigned reqs_available;
+ };
+
++struct ctx_rq_wait {
++ struct completion comp;
++ atomic_t count;
++};
++
+ struct kioctx {
+ struct percpu_ref users;
+ atomic_t dead;
+@@ -115,7 +120,7 @@ struct kioctx {
+ /*
+ * signals when all in-flight requests are done
+ */
+- struct completion *requests_done;
++ struct ctx_rq_wait *rq_wait;
+
+ struct {
+ /*
+@@ -539,8 +544,8 @@ static void free_ioctx_reqs(struct percp
+ struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
+
+ /* At this point we know that there are no any in-flight requests */
+- if (ctx->requests_done)
+- complete(ctx->requests_done);
++ if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
++ complete(&ctx->rq_wait->comp);
+
+ INIT_WORK(&ctx->free_work, free_ioctx);
+ schedule_work(&ctx->free_work);
+@@ -751,7 +756,7 @@ err:
+ * the rapid destruction of the kioctx.
+ */
+ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
+- struct completion *requests_done)
++ struct ctx_rq_wait *wait)
+ {
+ struct kioctx_table *table;
+
+@@ -781,7 +786,7 @@ static int kill_ioctx(struct mm_struct *
+ if (ctx->mmap_size)
+ vm_munmap(ctx->mmap_base, ctx->mmap_size);
+
+- ctx->requests_done = requests_done;
++ ctx->rq_wait = wait;
+ percpu_ref_kill(&ctx->users);
+ return 0;
+ }
+@@ -813,18 +818,24 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
+ void exit_aio(struct mm_struct *mm)
+ {
+ struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table);
+- int i;
++ struct ctx_rq_wait wait;
++ int i, skipped;
+
+ if (!table)
+ return;
+
++ atomic_set(&wait.count, table->nr);
++ init_completion(&wait.comp);
++
++ skipped = 0;
+ for (i = 0; i < table->nr; ++i) {
+ struct kioctx *ctx = table->table[i];
+- struct completion requests_done =
+- COMPLETION_INITIALIZER_ONSTACK(requests_done);
+
+- if (!ctx)
++ if (!ctx) {
++ skipped++;
+ continue;
++ }
++
+ /*
+ * We don't need to bother with munmap() here - exit_mmap(mm)
+ * is coming and it'll unmap everything. And we simply can't,
+@@ -833,10 +844,12 @@ void exit_aio(struct mm_struct *mm)
+ * that it needs to unmap the area, just set it to 0.
+ */
+ ctx->mmap_size = 0;
+- kill_ioctx(mm, ctx, &requests_done);
++ kill_ioctx(mm, ctx, &wait);
++ }
+
++ if (!atomic_sub_and_test(skipped, &wait.count)) {
+ /* Wait until all IO for the context are done. */
+- wait_for_completion(&requests_done);
++ wait_for_completion(&wait.comp);
+ }
+
+ RCU_INIT_POINTER(mm->ioctx_table, NULL);
+@@ -1321,15 +1334,17 @@ SYSCALL_DEFINE1(io_destroy, aio_context_
+ {
+ struct kioctx *ioctx = lookup_ioctx(ctx);
+ if (likely(NULL != ioctx)) {
+- struct completion requests_done =
+- COMPLETION_INITIALIZER_ONSTACK(requests_done);
++ struct ctx_rq_wait wait;
+ int ret;
+
++ init_completion(&wait.comp);
++ atomic_set(&wait.count, 1);
++
+ /* Pass requests_done to kill_ioctx() where it can be set
+ * in a thread-safe way. If we try to set it here then we have
+ * a race condition if two io_destroy() called simultaneously.
+ */
+- ret = kill_ioctx(current->mm, ioctx, &requests_done);
++ ret = kill_ioctx(current->mm, ioctx, &wait);
+ percpu_ref_put(&ioctx->users);
+
+ /* Wait until all IO for the context are done. Otherwise kernel
+@@ -1337,7 +1352,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_
+ * is destroyed.
+ */
+ if (!ret)
+- wait_for_completion(&requests_done);
++ wait_for_completion(&wait.comp);
+
+ return ret;
+ }
diff --git a/queue-4.0/crush-ensuring-at-most-num-rep-osds-are-selected.patch b/queue-4.0/crush-ensuring-at-most-num-rep-osds-are-selected.patch
new file mode 100644
index 00000000000..171e1c05a2f
--- /dev/null
+++ b/queue-4.0/crush-ensuring-at-most-num-rep-osds-are-selected.patch
@@ -0,0 +1,117 @@
+From 45002267e8d2699bf9b022315bee3dd13b044843 Mon Sep 17 00:00:00 2001
+From: Ilya Dryomov
+Date: Tue, 14 Apr 2015 16:04:23 +0300
+Subject: crush: ensuring at most num-rep osds are selected
+
+From: Ilya Dryomov
+
+commit 45002267e8d2699bf9b022315bee3dd13b044843 upstream.
+
+Crush temporary buffers are allocated according to the replica size
+configured by the user. When the rule selects more final osds than
+there are replicas, the buffer overflows and causes a crash. Now at
+most num-rep osds are selected, even if the rule allows more.
+
+Reflects ceph.git commits 6b4d1aa99718e3b367496326c1e64551330fabc0,
+ 234b066ba04976783d15ff2abc3e81b6cc06fb10.
+
+Signed-off-by: Ilya Dryomov
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ net/ceph/crush/mapper.c | 16 ++++++++++++----
+ 1 file changed, 12 insertions(+), 4 deletions(-)
+
+--- a/net/ceph/crush/mapper.c
++++ b/net/ceph/crush/mapper.c
+@@ -290,6 +290,7 @@ static int is_out(const struct crush_map
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
++ * @out_size: size of the out vector
+ * @tries: number of attempts to make
+ * @recurse_tries: number of attempts to have recursive chooseleaf make
+ * @local_retries: localized retries
+@@ -304,6 +305,7 @@ static int crush_choose_firstn(const str
+ const __u32 *weight, int weight_max,
+ int x, int numrep, int type,
+ int *out, int outpos,
++ int out_size,
+ unsigned int tries,
+ unsigned int recurse_tries,
+ unsigned int local_retries,
+@@ -322,6 +324,7 @@ static int crush_choose_firstn(const str
+ int item = 0;
+ int itemtype;
+ int collide, reject;
++ int count = out_size;
+
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+ recurse_to_leaf ? "_LEAF" : "",
+@@ -329,7 +332,7 @@ static int crush_choose_firstn(const str
+ tries, recurse_tries, local_retries, local_fallback_retries,
+ parent_r);
+
+- for (rep = outpos; rep < numrep; rep++) {
++ for (rep = outpos; rep < numrep && count > 0 ; rep++) {
+ /* keep trying until we get a non-out, non-colliding item */
+ ftotal = 0;
+ skip_rep = 0;
+@@ -403,7 +406,7 @@ static int crush_choose_firstn(const str
+ map->buckets[-1-item],
+ weight, weight_max,
+ x, outpos+1, 0,
+- out2, outpos,
++ out2, outpos, count,
+ recurse_tries, 0,
+ local_retries,
+ local_fallback_retries,
+@@ -463,6 +466,7 @@ reject:
+ dprintk("CHOOSE got %d\n", item);
+ out[outpos] = item;
+ outpos++;
++ count--;
+ }
+
+ dprintk("CHOOSE returns %d\n", outpos);
+@@ -654,6 +658,7 @@ int crush_do_rule(const struct crush_map
+ __u32 step;
+ int i, j;
+ int numrep;
++ int out_size;
+ /*
+ * the original choose_total_tries value was off by one (it
+ * counted "retries" and not "tries"). add one.
+@@ -761,6 +766,7 @@ int crush_do_rule(const struct crush_map
+ x, numrep,
+ curstep->arg2,
+ o+osize, j,
++ result_max-osize,
+ choose_tries,
+ recurse_tries,
+ choose_local_retries,
+@@ -770,11 +776,13 @@ int crush_do_rule(const struct crush_map
+ c+osize,
+ 0);
+ } else {
++ out_size = ((numrep < (result_max-osize)) ?
++ numrep : (result_max-osize));
+ crush_choose_indep(
+ map,
+ map->buckets[-1-w[i]],
+ weight, weight_max,
+- x, numrep, numrep,
++ x, out_size, numrep,
+ curstep->arg2,
+ o+osize, j,
+ choose_tries,
+@@ -783,7 +791,7 @@ int crush_do_rule(const struct crush_map
+ recurse_to_leaf,
+ c+osize,
+ 0);
+- osize += numrep;
++ osize += out_size;
+ }
+ }
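
The aio patch above replaces one on-stack completion per io context with a
single shared pair of a completion and an atomic counter, so exit_aio()
blocks once for all contexts instead of once per context (and per RCU grace
period). Below is a minimal userspace sketch of that waiting pattern, with a
pthread mutex/condvar standing in for the kernel's struct completion and
atomic_t; all names are illustrative, not kernel or aio API.

/* batch_wait.c -- build with: cc batch_wait.c -o batch_wait -lpthread */
#include <pthread.h>
#include <stdio.h>

struct ctx_rq_wait {			/* userspace stand-in for the patch's struct */
	pthread_mutex_t lock;
	pthread_cond_t comp;		/* plays the role of struct completion */
	int count;			/* plays the role of atomic_t count */
};

static struct ctx_rq_wait wait_all = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
};

/* Each "context" signals the shared waiter when its draining is done,
 * as free_ioctx_reqs() does via atomic_dec_and_test() + complete(). */
static void *drain_ctx(void *arg)
{
	/* ... drain this context's in-flight requests here ... */
	pthread_mutex_lock(&wait_all.lock);
	if (--wait_all.count == 0)
		pthread_cond_signal(&wait_all.comp);
	pthread_mutex_unlock(&wait_all.lock);
	return arg;
}

int main(void)
{
	enum { NCTX = 8 };
	pthread_t tid[NCTX];
	int i;

	wait_all.count = NCTX;		/* arm the counter before starting */
	for (i = 0; i < NCTX; i++)
		pthread_create(&tid[i], NULL, drain_ctx, NULL);

	/* One wait for all contexts, not NCTX serial waits. */
	pthread_mutex_lock(&wait_all.lock);
	while (wait_all.count > 0)
		pthread_cond_wait(&wait_all.comp, &wait_all.lock);
	pthread_mutex_unlock(&wait_all.lock);

	for (i = 0; i < NCTX; i++)
		pthread_join(tid[i], NULL);
	printf("all %d contexts drained\n", NCTX);
	return 0;
}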
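
The crush patch above fixes the overrun by clamping how many osds a
selection step may emit to the space left in the caller's result buffer,
i.e. out_size = min(numrep, result_max - osize). A minimal sketch of that
clamp, outside the kernel and with illustrative names only:

/* clamp_select.c -- build with: cc clamp_select.c -o clamp_select */
#include <stdio.h>

/* Toy selection step: emits at most out_size ids, returns how many.
 * Mirrors crush_choose_firstn()/crush_choose_indep() taking an explicit
 * out_size bound instead of trusting numrep. */
static int choose_step(int *out, int out_size, int numrep)
{
	int n = (numrep < out_size) ? numrep : out_size;	/* the clamp */
	int i;

	for (i = 0; i < n; i++)
		out[i] = 100 + i;	/* fake osd ids */
	return n;
}

int main(void)
{
	int result[3];			/* buffer sized for num-rep = 3 */
	int result_max = 3, osize = 0;
	int numrep = 5;			/* rule asks for more osds than num-rep */

	/* Before the fix the step wrote numrep entries and overran result[];
	 * with the clamp it can never emit more than result_max - osize. */
	osize += choose_step(result + osize, result_max - osize, numrep);

	printf("selected %d of %d requested osds\n", osize, numrep);
	return 0;
}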