1 From a5049a8ae34950249a7ae94c385d7c5c98914412 Mon Sep 17 00:00:00 2001
2 From: Tejun Heo <tj@kernel.org>
3 Date: Thu, 19 Jun 2014 17:42:57 -0400
4 Subject: blkcg: fix use-after-free in __blkg_release_rcu() by making blkcg_gq refcnt an atomic_t
6 From: Tejun Heo <tj@kernel.org>
8 commit a5049a8ae34950249a7ae94c385d7c5c98914412 upstream.
12 So, this patch should do. Joe, Vivek, can one of you guys please
13 verify that the oops goes away with this patch?
15 Jens, the original thread can be read at
17 http://thread.gmane.org/gmane.linux.kernel/1720729
19 The fix converts blkg->refcnt from int to atomic_t. It does some
20 overhead but it should be minute compared to everything else which is
21 going on and the involved cacheline bouncing, so I think it's highly
22 unlikely to cause any noticeable difference. Also, the refcnt in
23 question should be converted to a perpcu_ref for blk-mq anyway, so the
24 atomic_t is likely to go away pretty soon anyway.
29 __blkg_release_rcu() may be invoked after the associated request_queue
30 is released with a RCU grace period inbetween. As such, the function
31 and callbacks invoked from it must not dereference the associated
32 request_queue. This is clearly indicated in the comment above the
35 Unfortunately, while trying to fix a different issue, 2a4fd070ee85
36 ("blkcg: move bulk of blkcg_gq release operations to the RCU
37 callback") ignored this and added [un]locking of @blkg->q->queue_lock
38 to __blkg_release_rcu(). This of course can cause oops as the
39 request_queue may be long gone by the time this code gets executed.
41 general protection fault: 0000 [#1] SMP
42 CPU: 21 PID: 30 Comm: rcuos/21 Not tainted 3.15.0 #1
43 Hardware name: Stratus ftServer 6400/G7LAZ, BIOS BIOS Version 6.3:57 12/25/2013
44 task: ffff880854021de0 ti: ffff88085403c000 task.ti: ffff88085403c000
45 RIP: 0010:[<ffffffff8162e9e5>] [<ffffffff8162e9e5>] _raw_spin_lock_irq+0x15/0x60
46 RSP: 0018:ffff88085403fdf0 EFLAGS: 00010086
47 RAX: 0000000000020000 RBX: 0000000000000010 RCX: 0000000000000000
48 RDX: 000060ef80008248 RSI: 0000000000000286 RDI: 6b6b6b6b6b6b6b6b
49 RBP: ffff88085403fdf0 R08: 0000000000000286 R09: 0000000000009f39
50 R10: 0000000000020001 R11: 0000000000020001 R12: ffff88103c17a130
51 R13: ffff88103c17a080 R14: 0000000000000000 R15: 0000000000000000
52 FS: 0000000000000000(0000) GS:ffff88107fca0000(0000) knlGS:0000000000000000
53 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
54 CR2: 00000000006e5ab8 CR3: 000000000193d000 CR4: 00000000000407e0
56 ffff88085403fe18 ffffffff812cbfc2 ffff88103c17a130 0000000000000000
57 ffff88103c17a130 ffff88085403fec0 ffffffff810d1d28 ffff880854021de0
58 ffff880854021de0 ffff88107fcaec58 ffff88085403fe80 ffff88107fcaec30
60 [<ffffffff812cbfc2>] __blkg_release_rcu+0x72/0x150
61 [<ffffffff810d1d28>] rcu_nocb_kthread+0x1e8/0x300
62 [<ffffffff81091d81>] kthread+0xe1/0x100
63 [<ffffffff8163813c>] ret_from_fork+0x7c/0xb0
64 Code: ff 47 04 48 8b 7d 08 be 00 02 00 00 e8 55 48 a4 ff 5d c3 0f 1f 00 66 66 66 66 90 55 48 89 e5
65 +fa 66 66 90 66 66 90 b8 00 00 02 00 <f0> 0f c1 07 89 c2 c1 ea 10 66 39 c2 75 02 5d c3 83 e2 fe 0f
67 RIP [<ffffffff8162e9e5>] _raw_spin_lock_irq+0x15/0x60
68 RSP <ffff88085403fdf0>
70 The request_queue locking was added because blkcg_gq->refcnt is an int
71 protected with the queue lock and __blkg_release_rcu() needs to put
72 the parent. Let's fix it by making blkcg_gq->refcnt an atomic_t and
73 dropping queue locking in the function.
75 Given the general heavy weight of the current request_queue and blkcg
76 operations, this is unlikely to cause any noticeable overhead.
77 Moreover, blkcg_gq->refcnt is likely to be converted to percpu_ref in
78 the near future, so whatever (most likely negligible) overhead it may
81 Signed-off-by: Tejun Heo <tj@kernel.org>
82 Reported-by: Joe Lawrence <joe.lawrence@stratus.com>
83 Acked-by: Vivek Goyal <vgoyal@redhat.com>
84 Link: http://lkml.kernel.org/g/alpine.DEB.2.02.1406081816540.17948@jlaw-desktop.mno.stratus.com
85 Signed-off-by: Jens Axboe <axboe@fb.com>
86 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
89 block/blk-cgroup.c | 7 ++-----
90 block/blk-cgroup.h | 17 +++++++----------
91 2 files changed, 9 insertions(+), 15 deletions(-)
93 --- a/block/blk-cgroup.c
94 +++ b/block/blk-cgroup.c
95 @@ -80,7 +80,7 @@ static struct blkcg_gq *blkg_alloc(struc
97 INIT_LIST_HEAD(&blkg->q_node);
100 + atomic_set(&blkg->refcnt, 1);
102 /* root blkg uses @q->root_rl, init rl only for !root blkgs */
103 if (blkcg != &blkcg_root) {
104 @@ -399,11 +399,8 @@ void __blkg_release_rcu(struct rcu_head
106 /* release the blkcg and parent blkg refs this blkg has been holding */
107 css_put(&blkg->blkcg->css);
108 - if (blkg->parent) {
109 - spin_lock_irq(blkg->q->queue_lock);
111 blkg_put(blkg->parent);
112 - spin_unlock_irq(blkg->q->queue_lock);
117 --- a/block/blk-cgroup.h
118 +++ b/block/blk-cgroup.h
120 #include <linux/seq_file.h>
121 #include <linux/radix-tree.h>
122 #include <linux/blkdev.h>
123 +#include <linux/atomic.h>
125 /* Max limits for throttle policy */
126 #define THROTL_IOPS_MAX UINT_MAX
127 @@ -104,7 +105,7 @@ struct blkcg_gq {
128 struct request_list rl;
130 /* reference count */
134 /* is this blkg online? protected by both blkcg and q locks */
136 @@ -253,13 +254,12 @@ static inline int blkg_path(struct blkcg
137 * blkg_get - get a blkg reference
140 - * The caller should be holding queue_lock and an existing reference.
141 + * The caller should be holding an existing reference.
143 static inline void blkg_get(struct blkcg_gq *blkg)
145 - lockdep_assert_held(blkg->q->queue_lock);
146 - WARN_ON_ONCE(!blkg->refcnt);
148 + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
149 + atomic_inc(&blkg->refcnt);
152 void __blkg_release_rcu(struct rcu_head *rcu);
153 @@ -267,14 +267,11 @@ void __blkg_release_rcu(struct rcu_head
155 * blkg_put - put a blkg reference
158 - * The caller should be holding queue_lock.
160 static inline void blkg_put(struct blkcg_gq *blkg)
162 - lockdep_assert_held(blkg->q->queue_lock);
163 - WARN_ON_ONCE(blkg->refcnt <= 0);
164 - if (!--blkg->refcnt)
165 + WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
166 + if (atomic_dec_and_test(&blkg->refcnt))
167 call_rcu(&blkg->rcu_head, __blkg_release_rcu);