1 From: Mikulas Patocka <mpatocka@redhat.com>
2 Subject: dm table: rework reference counting
5 Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
7 [PATCH 3/3] for bnc 457205.
9 mainline commit d58168763f74d1edbc296d7038c60efe6493fdd4
12 Rework table reference counting.
14 The existing code uses a reference counter. When the last reference is
15 dropped and the counter reaches zero, the table destructor is called.
16 Table reference counters are acquired/released from upcalls from other
17 kernel code (dm_any_congested, dm_merge_bvec, dm_unplug_all).
18 If the reference counter reaches zero in one of the upcalls, the table
19 destructor is called from almost random kernel code.
21 This leads to various problems:
22 * dm_any_congested being called under a spinlock, which calls the
23 destructor, which calls some sleeping function.
24 * the destructor attempting to take a lock that is already taken by the
25 same process.
26 * stale reference from some other kernel code keeps the table
27 constructed, which keeps some devices open, even after successful
28 return from "dmsetup remove". This can confuse lvm and prevent closing
29 of underlying devices or reusing device minor numbers.
31 The patch changes reference counting so that the table destructor can be
32 called only at predetermined places.
34 The table has always exactly one reference from either mapped_device->map
35 or hash_cell->new_map. After this patch, this reference is not counted
36 in table->holders. A pair of dm_create_table/dm_destroy_table functions
37 is used for table creation/destruction.
39 Temporary references from the other code increase table->holders. A pair
40 of dm_table_get/dm_table_put functions is used to manipulate it.
42 When the table is about to be destroyed, we wait for table->holders to
43 reach 0. Then, we call the table destructor. We use active waiting with
44 msleep(1), because the situation happens rarely (to one user in 5 years)
45 and removing the device isn't a performance-critical task: the user doesn't
46 care if it takes one tick more or not.
48 This way, the destructor is called only at specific points
49 (dm_table_destroy function) and the above problems associated with lazy
50 destruction can't happen.
52 Finally remove the temporary protection added to dm_any_congested().
54 Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
55 Signed-off-by: Alasdair G Kergon <agk@redhat.com>
58 drivers/md/dm-ioctl.c | 10 ++++------
59 drivers/md/dm-table.c | 28 +++++++++++++++++++++++-----
60 drivers/md/dm.c | 14 +++++---------
61 drivers/md/dm.h | 1 +
62 4 files changed, 33 insertions(+), 20 deletions(-)
66 @@ -1644,8 +1644,6 @@ static int dm_any_congested(void *conges
67 struct mapped_device *md = (struct mapped_device *) congested_data;
70 - atomic_inc(&md->pending);
72 if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
73 map = dm_get_table(md);
75 @@ -1662,10 +1660,6 @@ static int dm_any_congested(void *conges
79 - if (!atomic_dec_return(&md->pending))
80 - /* nudge anyone waiting on suspend queue */
86 @@ -1926,10 +1920,12 @@ static int __bind(struct mapped_device *
88 if (md->suspended_bdev)
93 + dm_table_destroy(t);
98 dm_table_event_callback(t, event_callback, md);
101 @@ -1967,7 +1963,7 @@ static void __unbind(struct mapped_devic
102 write_lock(&md->map_lock);
104 write_unlock(&md->map_lock);
106 + dm_table_destroy(map);
110 --- a/drivers/md/dm.h
111 +++ b/drivers/md/dm.h
112 @@ -46,6 +46,7 @@ struct dm_table;
113 /*-----------------------------------------------------------------
114 * Internal table functions.
115 *---------------------------------------------------------------*/
116 +void dm_table_destroy(struct dm_table *t);
117 void dm_table_event_callback(struct dm_table *t,
118 void (*fn)(void *), void *context);
119 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
120 --- a/drivers/md/dm-ioctl.c
121 +++ b/drivers/md/dm-ioctl.c
122 @@ -233,7 +233,7 @@ static void __hash_remove(struct hash_ce
126 - dm_table_put(hc->new_map);
127 + dm_table_destroy(hc->new_map);
131 @@ -828,8 +828,8 @@ static int do_resume(struct dm_ioctl *pa
133 r = dm_swap_table(md, new_map);
135 + dm_table_destroy(new_map);
137 - dm_table_put(new_map);
141 @@ -837,8 +837,6 @@ static int do_resume(struct dm_ioctl *pa
142 set_disk_ro(dm_disk(md), 0);
144 set_disk_ro(dm_disk(md), 1);
146 - dm_table_put(new_map);
149 if (dm_suspended(md))
150 @@ -1094,7 +1092,7 @@ static int table_load(struct dm_ioctl *p
154 - dm_table_put(hc->new_map);
155 + dm_table_destroy(hc->new_map);
157 up_write(&_hash_lock);
159 @@ -1123,7 +1121,7 @@ static int table_clear(struct dm_ioctl *
163 - dm_table_put(hc->new_map);
164 + dm_table_destroy(hc->new_map);
168 --- a/drivers/md/dm-table.c
169 +++ b/drivers/md/dm-table.c
172 * Copyright (C) 2001 Sistina Software (UK) Limited.
173 - * Copyright (C) 2004 Red Hat, Inc. All rights reserved.
174 + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
176 * This file is released under the GPL.
179 #include <linux/slab.h>
180 #include <linux/interrupt.h>
181 #include <linux/mutex.h>
182 +#include <linux/delay.h>
183 #include <asm/atomic.h>
185 #define DM_MSG_PREFIX "table"
187 #define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
188 #define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)
191 + * The table has always exactly one reference from either mapped_device->map
192 + * or hash_cell->new_map. This reference is not counted in table->holders.
193 + * A pair of dm_create_table/dm_destroy_table functions is used for table
194 + * creation/destruction.
196 + * Temporary references from the other code increase table->holders. A pair
197 + * of dm_table_get/dm_table_put functions is used to manipulate it.
199 + * When the table is about to be destroyed, we wait for table->holders to
204 struct mapped_device *md;
206 @@ -231,7 +245,7 @@ int dm_table_create(struct dm_table **re
209 INIT_LIST_HEAD(&t->devices);
210 - atomic_set(&t->holders, 1);
211 + atomic_set(&t->holders, 0);
214 num_targets = KEYS_PER_NODE;
215 @@ -260,10 +274,14 @@ static void free_devices(struct list_hea
219 -static void table_destroy(struct dm_table *t)
220 +void dm_table_destroy(struct dm_table *t)
224 + while (atomic_read(&t->holders))
228 /* free the indexes (see dm_table_complete) */
230 vfree(t->index[t->depth - 2]);
231 @@ -301,8 +319,8 @@ void dm_table_put(struct dm_table *t)
235 - if (atomic_dec_and_test(&t->holders))
237 + smp_mb__before_atomic_dec();
238 + atomic_dec(&t->holders);