--- /dev/null
+From 0048b4837affd153897ed1222283492070027aa9 Mon Sep 17 00:00:00 2001
+From: Ming Lei <ming.lei@canonical.com>
+Date: Sun, 9 Aug 2015 03:41:51 -0400
+Subject: blk-mq: fix race between timeout and freeing request
+
+From: Ming Lei <ming.lei@canonical.com>
+
+commit 0048b4837affd153897ed1222283492070027aa9 upstream.
+
+Inside timeout handler, blk_mq_tag_to_rq() is called
+to retrieve the request from one tag. This way is obviously
+wrong because the request can be freed any time and some
+fields of the request can't be trusted, then kernel oops
+might be triggered[1].
+
+Currently wrt. blk_mq_tag_to_rq(), the only special case is
+that the flush request can share same tag with the request
+cloned from, and the two requests can't be active at the same
+time, so this patch fixes the above issue by updating tags->rqs[tag]
+with the active request(either flush rq or the request cloned
+from) of the tag.
+
+Also blk_mq_tag_to_rq() gets much simplified with this patch.
+
+Given blk_mq_tag_to_rq() is mainly for drivers and the caller must
+make sure the request can't be freed, so in bt_for_each() this
+helper is replaced with tags->rqs[tag].
+
+[1] kernel oops log
+[ 439.696220] BUG: unable to handle kernel NULL pointer dereference at 0000000000000158^M
+[ 439.697162] IP: [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M
+[ 439.700653] PGD 7ef765067 PUD 7ef764067 PMD 0 ^M
+[ 439.700653] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ^M
+[ 439.700653] Dumping ftrace buffer:^M
+[ 439.700653] (ftrace buffer empty)^M
+[ 439.700653] Modules linked in: nbd ipv6 kvm_intel kvm serio_raw^M
+[ 439.700653] CPU: 6 PID: 2779 Comm: stress-ng-sigfd Not tainted 4.2.0-rc5-next-20150805+ #265^M
+[ 439.730500] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011^M
+[ 439.730500] task: ffff880605308000 ti: ffff88060530c000 task.ti: ffff88060530c000^M
+[ 439.730500] RIP: 0010:[<ffffffff812d89ba>] [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M
+[ 439.730500] RSP: 0018:ffff880819203da0 EFLAGS: 00010283^M
+[ 439.730500] RAX: ffff880811b0e000 RBX: ffff8800bb465f00 RCX: 0000000000000002^M
+[ 439.730500] RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000^M
+[ 439.730500] RBP: ffff880819203db0 R08: 0000000000000002 R09: 0000000000000000^M
+[ 439.730500] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000202^M
+[ 439.730500] R13: ffff880814104800 R14: 0000000000000002 R15: ffff880811a2ea00^M
+[ 439.730500] FS: 00007f165b3f5740(0000) GS:ffff880819200000(0000) knlGS:0000000000000000^M
+[ 439.730500] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b^M
+[ 439.730500] CR2: 0000000000000158 CR3: 00000007ef766000 CR4: 00000000000006e0^M
+[ 439.730500] Stack:^M
+[ 439.730500] 0000000000000008 ffff8808114eed90 ffff880819203e00 ffffffff812dc104^M
+[ 439.755663] ffff880819203e40 ffffffff812d9f5e 0000020000000000 ffff8808114eed80^M
+[ 439.755663] Call Trace:^M
+[ 439.755663] <IRQ> ^M
+[ 439.755663] [<ffffffff812dc104>] bt_for_each+0x6e/0xc8^M
+[ 439.755663] [<ffffffff812d9f5e>] ? blk_mq_rq_timed_out+0x6a/0x6a^M
+[ 439.755663] [<ffffffff812d9f5e>] ? blk_mq_rq_timed_out+0x6a/0x6a^M
+[ 439.755663] [<ffffffff812dc1b3>] blk_mq_tag_busy_iter+0x55/0x5e^M
+[ 439.755663] [<ffffffff812d88b4>] ? blk_mq_bio_to_request+0x38/0x38^M
+[ 439.755663] [<ffffffff812d8911>] blk_mq_rq_timer+0x5d/0xd4^M
+[ 439.755663] [<ffffffff810a3e10>] call_timer_fn+0xf7/0x284^M
+[ 439.755663] [<ffffffff810a3d1e>] ? call_timer_fn+0x5/0x284^M
+[ 439.755663] [<ffffffff812d88b4>] ? blk_mq_bio_to_request+0x38/0x38^M
+[ 439.755663] [<ffffffff810a46d6>] run_timer_softirq+0x1ce/0x1f8^M
+[ 439.755663] [<ffffffff8104c367>] __do_softirq+0x181/0x3a4^M
+[ 439.755663] [<ffffffff8104c76e>] irq_exit+0x40/0x94^M
+[ 439.755663] [<ffffffff81031482>] smp_apic_timer_interrupt+0x33/0x3e^M
+[ 439.755663] [<ffffffff815559a4>] apic_timer_interrupt+0x84/0x90^M
+[ 439.755663] <EOI> ^M
+[ 439.755663] [<ffffffff81554350>] ? _raw_spin_unlock_irq+0x32/0x4a^M
+[ 439.755663] [<ffffffff8106a98b>] finish_task_switch+0xe0/0x163^M
+[ 439.755663] [<ffffffff8106a94d>] ? finish_task_switch+0xa2/0x163^M
+[ 439.755663] [<ffffffff81550066>] __schedule+0x469/0x6cd^M
+[ 439.755663] [<ffffffff8155039b>] schedule+0x82/0x9a^M
+[ 439.789267] [<ffffffff8119b28b>] signalfd_read+0x186/0x49a^M
+[ 439.790911] [<ffffffff8106d86a>] ? wake_up_q+0x47/0x47^M
+[ 439.790911] [<ffffffff811618c2>] __vfs_read+0x28/0x9f^M
+[ 439.790911] [<ffffffff8117a289>] ? __fget_light+0x4d/0x74^M
+[ 439.790911] [<ffffffff811620a7>] vfs_read+0x7a/0xc6^M
+[ 439.790911] [<ffffffff8116292b>] SyS_read+0x49/0x7f^M
+[ 439.790911] [<ffffffff81554c17>] entry_SYSCALL_64_fastpath+0x12/0x6f^M
+[ 439.790911] Code: 48 89 e5 e8 a9 b8 e7 ff 5d c3 0f 1f 44 00 00 55 89
+f2 48 89 e5 41 54 41 89 f4 53 48 8b 47 60 48 8b 1c d0 48 8b 7b 30 48 8b
+53 38 <48> 8b 87 58 01 00 00 48 85 c0 75 09 48 8b 97 88 0c 00 00 eb 10
+^M
+[ 439.790911] RIP [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M
+[ 439.790911] RSP <ffff880819203da0>^M
+[ 439.790911] CR2: 0000000000000158^M
+[ 439.790911] ---[ end trace d40af58949325661 ]---^M
+
+Signed-off-by: Ming Lei <ming.lei@canonical.com>
+Signed-off-by: Jens Axboe <axboe@fb.com>
+Signed-off-by: Dmitry Shmidt <dimitrysh@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/blk-flush.c | 15 ++++++++++++++-
+ block/blk-mq-tag.c | 2 +-
+ block/blk-mq-tag.h | 12 ++++++++++++
+ block/blk-mq.c | 16 +---------------
+ block/blk.h | 6 ++++++
+ 5 files changed, 34 insertions(+), 17 deletions(-)
+
+--- a/block/blk-flush.c
++++ b/block/blk-flush.c
+@@ -73,6 +73,7 @@
+
+ #include "blk.h"
+ #include "blk-mq.h"
++#include "blk-mq-tag.h"
+
+ /* FLUSH/FUA sequences */
+ enum {
+@@ -226,7 +227,12 @@ static void flush_end_io(struct request
+ struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
+
+ if (q->mq_ops) {
++ struct blk_mq_hw_ctx *hctx;
++
++ /* release the tag's ownership to the req cloned from */
+ spin_lock_irqsave(&fq->mq_flush_lock, flags);
++ hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu);
++ blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
+ flush_rq->tag = -1;
+ }
+
+@@ -308,11 +314,18 @@ static bool blk_kick_flush(struct reques
+
+ /*
+ * Borrow tag from the first request since they can't
+- * be in flight at the same time.
++ * be in flight at the same time. And acquire the tag's
++ * ownership for flush req.
+ */
+ if (q->mq_ops) {
++ struct blk_mq_hw_ctx *hctx;
++
+ flush_rq->mq_ctx = first_rq->mq_ctx;
+ flush_rq->tag = first_rq->tag;
++ fq->orig_rq = first_rq;
++
++ hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu);
++ blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
+ }
+
+ flush_rq->cmd_type = REQ_TYPE_FS;
+--- a/block/blk-mq-tag.c
++++ b/block/blk-mq-tag.c
+@@ -403,7 +403,7 @@ static void bt_for_each(struct blk_mq_hw
+ for (bit = find_first_bit(&bm->word, bm->depth);
+ bit < bm->depth;
+ bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+- rq = blk_mq_tag_to_rq(hctx->tags, off + bit);
++ rq = hctx->tags->rqs[off + bit];
+ if (rq->q == hctx->queue)
+ fn(hctx, rq, data, reserved);
+ }
+--- a/block/blk-mq-tag.h
++++ b/block/blk-mq-tag.h
+@@ -85,4 +85,16 @@ static inline void blk_mq_tag_idle(struc
+ __blk_mq_tag_idle(hctx);
+ }
+
++/*
++ * This helper should only be used for flush request to share tag
++ * with the request cloned from, and both the two requests can't be
++ * in flight at the same time. The caller has to make sure the tag
++ * can't be freed.
++ */
++static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
++ unsigned int tag, struct request *rq)
++{
++ hctx->tags->rqs[tag] = rq;
++}
++
+ #endif
+--- a/block/blk-mq.c
++++ b/block/blk-mq.c
+@@ -498,23 +498,9 @@ void blk_mq_kick_requeue_list(struct req
+ }
+ EXPORT_SYMBOL(blk_mq_kick_requeue_list);
+
+-static inline bool is_flush_request(struct request *rq,
+- struct blk_flush_queue *fq, unsigned int tag)
+-{
+- return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
+- fq->flush_rq->tag == tag);
+-}
+-
+ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+ {
+- struct request *rq = tags->rqs[tag];
+- /* mq_ctx of flush rq is always cloned from the corresponding req */
+- struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
+-
+- if (!is_flush_request(rq, fq, tag))
+- return rq;
+-
+- return fq->flush_rq;
++ return tags->rqs[tag];
+ }
+ EXPORT_SYMBOL(blk_mq_tag_to_rq);
+
+--- a/block/blk.h
++++ b/block/blk.h
+@@ -22,6 +22,12 @@ struct blk_flush_queue {
+ struct list_head flush_queue[2];
+ struct list_head flush_data_in_flight;
+ struct request *flush_rq;
++
++ /*
++ * flush_rq shares tag with this rq, both can't be active
++ * at the same time
++ */
++ struct request *orig_rq;
+ spinlock_t mq_flush_lock;
+ };
+