]> git.ipfire.org Git - people/arne_f/kernel.git/blame - drivers/block/rbd.c
rbd: remove snap_name arg from rbd_add_parse_args()
[people/arne_f/kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
1e130199
AE
73#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 75
d889140c
AE
76/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
81a89793
AE
84/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
602adf40 90#define DEV_NAME_LEN 32
81a89793 91#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 92
cc0538b6 93#define RBD_READ_ONLY_DEFAULT false
59c2be1e 94
602adf40
YS
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to build data object names
				 * (see rbd_segment_name()) */
	u64 features;		/* always 0 for format 1 images */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;	/* from the on-disk header options */
	__u8 comp_type;		/* from the on-disk header options */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot id array + seq */
	char *snap_names;	/* snapshot name data copied from the
				 * on-disk header */
	u64 *snap_sizes;	/* one image size per snapshot */

	u64 obj_version;	/* version of the header object */
};
114
115struct rbd_options {
cc0538b6 116 bool read_only;
602adf40
YS
117};
118
119/*
f0f8cef5 120 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
121 */
122struct rbd_client {
123 struct ceph_client *client;
124 struct kref kref;
125 struct list_head node;
126};
127
128/*
f0f8cef5 129 * a request completion status
602adf40 130 */
1fec7093
YS
131struct rbd_req_status {
132 int done;
133 int rc;
134 u64 bytes;
135};
136
137/*
138 * a collection of requests
139 */
140struct rbd_req_coll {
141 int total;
142 int num_done;
143 struct kref kref;
144 struct rbd_req_status status[0];
602adf40
YS
145};
146
f0f8cef5
AE
147/*
148 * a single io request
149 */
150struct rbd_request {
151 struct request *rq; /* blk layer request */
152 struct bio *bio; /* cloned bio */
153 struct page **pages; /* list of used pages */
154 u64 len;
155 int coll_index;
156 struct rbd_req_coll *coll;
157};
158
dfc5606d
YS
159struct rbd_snap {
160 struct device dev;
161 const char *name;
3591538f 162 u64 size;
dfc5606d
YS
163 struct list_head node;
164 u64 id;
34b13184 165 u64 features;
dfc5606d
YS
166};
167
f84344f3 168struct rbd_mapping {
99c1f08f 169 u64 size;
34b13184 170 u64 features;
f84344f3
AE
171 bool read_only;
172};
173
602adf40
YS
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;	/* connection to the ceph cluster */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	bool exists;		/* set true once the mapping is established
				 * (see rbd_dev_set_mapping()) */
	char *image_id;		/* NOTE(review): presumably the format 2
				 * image id — confirm against rbd_add() */
	size_t image_id_len;
	char *image_name;
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;
	u64 pool_id;		/* used as fl_pg_pool in rbd_do_request() */

	char *snap_name;	/* mapped snapshot, or RBD_SNAP_HEAD_NAME */
	u64 snap_id;		/* CEPH_NOSNAP when the head is mapped */

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* size/features/ro of what is mapped */

	struct list_head node;	/* presumably links into rbd_dev_list —
				 * linking code not visible here */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
219
602adf40 220static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 221
602adf40 222static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
223static DEFINE_SPINLOCK(rbd_dev_list_lock);
224
432b8587
AE
225static LIST_HEAD(rbd_client_list); /* clients */
226static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 227
304f6808
AE
228static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
229static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
230
dfc5606d 231static void rbd_dev_release(struct device *dev);
41f38c2b 232static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 233
f0f8cef5
AE
234static ssize_t rbd_add(struct bus_type *bus, const char *buf,
235 size_t count);
236static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
237 size_t count);
238
239static struct bus_attribute rbd_bus_attrs[] = {
240 __ATTR(add, S_IWUSR, NULL, rbd_add),
241 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
242 __ATTR_NULL
243};
244
245static struct bus_type rbd_bus_type = {
246 .name = "rbd",
247 .bus_attrs = rbd_bus_attrs,
248};
249
250static void rbd_root_dev_release(struct device *dev)
251{
252}
253
254static struct device rbd_root_dev = {
255 .init_name = "rbd",
256 .release = rbd_root_dev_release,
257};
258
aafb230e
AE
259#ifdef RBD_DEBUG
260#define rbd_assert(expr) \
261 if (unlikely(!(expr))) { \
262 printk(KERN_ERR "\nAssertion failure in %s() " \
263 "at line %d:\n\n" \
264 "\trbd_assert(%s);\n\n", \
265 __func__, __LINE__, #expr); \
266 BUG(); \
267 }
268#else /* !RBD_DEBUG */
269# define rbd_assert(expr) ((void) 0)
270#endif /* !RBD_DEBUG */
dfc5606d 271
dfc5606d
YS
/* Take a reference on the rbd device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
276
/* Drop the reference taken by rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
602adf40 281
117973fb
AE
282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 284
602adf40
YS
285static int rbd_open(struct block_device *bdev, fmode_t mode)
286{
f0f8cef5 287 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 288
f84344f3 289 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
290 return -EROFS;
291
340c7a2b 292 rbd_get_dev(rbd_dev);
f84344f3 293 set_device_ro(bdev, rbd_dev->mapping.read_only);
340c7a2b 294
602adf40
YS
295 return 0;
296}
297
dfc5606d
YS
298static int rbd_release(struct gendisk *disk, fmode_t mode)
299{
300 struct rbd_device *rbd_dev = disk->private_data;
301
302 rbd_put_dev(rbd_dev);
303
304 return 0;
305}
306
602adf40
YS
307static const struct block_device_operations rbd_bd_ops = {
308 .owner = THIS_MODULE,
309 .open = rbd_open,
dfc5606d 310 .release = rbd_release,
602adf40
YS
311};
312
313/*
314 * Initialize an rbd client instance.
43ae4701 315 * We own *ceph_opts.
602adf40 316 */
f8c38929 317static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
318{
319 struct rbd_client *rbdc;
320 int ret = -ENOMEM;
321
322 dout("rbd_client_create\n");
323 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
324 if (!rbdc)
325 goto out_opt;
326
327 kref_init(&rbdc->kref);
328 INIT_LIST_HEAD(&rbdc->node);
329
bc534d86
AE
330 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
331
43ae4701 332 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 333 if (IS_ERR(rbdc->client))
bc534d86 334 goto out_mutex;
43ae4701 335 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
336
337 ret = ceph_open_session(rbdc->client);
338 if (ret < 0)
339 goto out_err;
340
432b8587 341 spin_lock(&rbd_client_list_lock);
602adf40 342 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 343 spin_unlock(&rbd_client_list_lock);
602adf40 344
bc534d86
AE
345 mutex_unlock(&ctl_mutex);
346
602adf40
YS
347 dout("rbd_client_create created %p\n", rbdc);
348 return rbdc;
349
350out_err:
351 ceph_destroy_client(rbdc->client);
bc534d86
AE
352out_mutex:
353 mutex_unlock(&ctl_mutex);
602adf40
YS
354 kfree(rbdc);
355out_opt:
43ae4701
AE
356 if (ceph_opts)
357 ceph_destroy_options(ceph_opts);
28f259b7 358 return ERR_PTR(ret);
602adf40
YS
359}
360
361/*
1f7ba331
AE
362 * Find a ceph client with specific addr and configuration. If
363 * found, bump its reference count.
602adf40 364 */
1f7ba331 365static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
366{
367 struct rbd_client *client_node;
1f7ba331 368 bool found = false;
602adf40 369
43ae4701 370 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
371 return NULL;
372
1f7ba331
AE
373 spin_lock(&rbd_client_list_lock);
374 list_for_each_entry(client_node, &rbd_client_list, node) {
375 if (!ceph_compare_options(ceph_opts, client_node->client)) {
376 kref_get(&client_node->kref);
377 found = true;
378 break;
379 }
380 }
381 spin_unlock(&rbd_client_list_lock);
382
383 return found ? client_node : NULL;
602adf40
YS
384}
385
59c2be1e
YS
386/*
387 * mount options
388 */
389enum {
59c2be1e
YS
390 Opt_last_int,
391 /* int args above */
392 Opt_last_string,
393 /* string args above */
cc0538b6
AE
394 Opt_read_only,
395 Opt_read_write,
396 /* Boolean args above */
397 Opt_last_bool,
59c2be1e
YS
398};
399
43ae4701 400static match_table_t rbd_opts_tokens = {
59c2be1e
YS
401 /* int args above */
402 /* string args above */
be466c1c 403 {Opt_read_only, "read_only"},
cc0538b6
AE
404 {Opt_read_only, "ro"}, /* Alternate spelling */
405 {Opt_read_write, "read_write"},
406 {Opt_read_write, "rw"}, /* Alternate spelling */
407 /* Boolean args above */
59c2be1e
YS
408 {-1, NULL}
409};
410
411static int parse_rbd_opts_token(char *c, void *private)
412{
43ae4701 413 struct rbd_options *rbd_opts = private;
59c2be1e
YS
414 substring_t argstr[MAX_OPT_ARGS];
415 int token, intval, ret;
416
43ae4701 417 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
418 if (token < 0)
419 return -EINVAL;
420
421 if (token < Opt_last_int) {
422 ret = match_int(&argstr[0], &intval);
423 if (ret < 0) {
424 pr_err("bad mount option arg (not int) "
425 "at '%s'\n", c);
426 return ret;
427 }
428 dout("got int token %d val %d\n", token, intval);
429 } else if (token > Opt_last_int && token < Opt_last_string) {
430 dout("got string token %d val %s\n", token,
431 argstr[0].from);
cc0538b6
AE
432 } else if (token > Opt_last_string && token < Opt_last_bool) {
433 dout("got Boolean token %d\n", token);
59c2be1e
YS
434 } else {
435 dout("got token %d\n", token);
436 }
437
438 switch (token) {
cc0538b6
AE
439 case Opt_read_only:
440 rbd_opts->read_only = true;
441 break;
442 case Opt_read_write:
443 rbd_opts->read_only = false;
444 break;
59c2be1e 445 default:
aafb230e
AE
446 rbd_assert(false);
447 break;
59c2be1e
YS
448 }
449 return 0;
450}
451
602adf40
YS
452/*
453 * Get a ceph client with specific addr and configuration, if one does
454 * not exist create it.
455 */
78cea76e
AE
456static int rbd_get_client(struct rbd_device *rbd_dev,
457 struct ceph_options *ceph_opts)
602adf40 458{
f8c38929 459 struct rbd_client *rbdc;
59c2be1e 460
1f7ba331 461 rbdc = rbd_client_find(ceph_opts);
602adf40 462 if (rbdc) {
602adf40 463 /* using an existing client */
43ae4701 464 ceph_destroy_options(ceph_opts);
f8c38929
AE
465 } else {
466 rbdc = rbd_client_create(ceph_opts);
467 if (IS_ERR(rbdc))
468 return PTR_ERR(rbdc);
602adf40 469 }
f8c38929 470 rbd_dev->rbd_client = rbdc;
602adf40 471
f8c38929 472 return 0;
602adf40
YS
473}
474
475/*
476 * Destroy ceph client
d23a4b3f 477 *
432b8587 478 * Caller must hold rbd_client_list_lock.
602adf40
YS
479 */
480static void rbd_client_release(struct kref *kref)
481{
482 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
483
484 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 485 spin_lock(&rbd_client_list_lock);
602adf40 486 list_del(&rbdc->node);
cd9d9f5d 487 spin_unlock(&rbd_client_list_lock);
602adf40
YS
488
489 ceph_destroy_client(rbdc->client);
490 kfree(rbdc);
491}
492
493/*
494 * Drop reference to ceph client node. If it's not referenced anymore, release
495 * it.
496 */
497static void rbd_put_client(struct rbd_device *rbd_dev)
498{
499 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
500 rbd_dev->rbd_client = NULL;
602adf40
YS
501}
502
1fec7093
YS
/*
 * Destroy requests collection (kref release callback).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
602adf40 514
a30b71b9
AE
515static bool rbd_image_format_valid(u32 image_format)
516{
517 return image_format == 1 || image_format == 2;
518}
519
8e94af8e
AE
520static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
521{
103a150f
AE
522 size_t size;
523 u32 snap_count;
524
525 /* The header has to start with the magic rbd header text */
526 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
527 return false;
528
db2388b6
AE
529 /* The bio layer requires at least sector-sized I/O */
530
531 if (ondisk->options.order < SECTOR_SHIFT)
532 return false;
533
534 /* If we use u64 in a few spots we may be able to loosen this */
535
536 if (ondisk->options.order > 8 * sizeof (int) - 1)
537 return false;
538
103a150f
AE
539 /*
540 * The size of a snapshot header has to fit in a size_t, and
541 * that limits the number of snapshots.
542 */
543 snap_count = le32_to_cpu(ondisk->snap_count);
544 size = SIZE_MAX - sizeof (struct ceph_snap_context);
545 if (snap_count > size / sizeof (__le64))
546 return false;
547
548 /*
549 * Not only that, but the size of the entire the snapshot
550 * header must also be representable in a size_t.
551 */
552 size -= snap_count * sizeof (__le64);
553 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
554 return false;
555
556 return true;
8e94af8e
AE
557}
558
602adf40
YS
559/*
560 * Create a new header structure, translate header format from the on-disk
561 * header.
562 */
563static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 564 struct rbd_image_header_ondisk *ondisk)
602adf40 565{
ccece235 566 u32 snap_count;
58c17b0e 567 size_t len;
d2bb24e5 568 size_t size;
621901d6 569 u32 i;
602adf40 570
6a52325f
AE
571 memset(header, 0, sizeof (*header));
572
103a150f
AE
573 snap_count = le32_to_cpu(ondisk->snap_count);
574
58c17b0e
AE
575 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
576 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 577 if (!header->object_prefix)
602adf40 578 return -ENOMEM;
58c17b0e
AE
579 memcpy(header->object_prefix, ondisk->object_prefix, len);
580 header->object_prefix[len] = '\0';
00f1f36f 581
602adf40 582 if (snap_count) {
f785cc1d
AE
583 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
584
621901d6
AE
585 /* Save a copy of the snapshot names */
586
f785cc1d
AE
587 if (snap_names_len > (u64) SIZE_MAX)
588 return -EIO;
589 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 590 if (!header->snap_names)
6a52325f 591 goto out_err;
f785cc1d
AE
592 /*
593 * Note that rbd_dev_v1_header_read() guarantees
594 * the ondisk buffer we're working with has
595 * snap_names_len bytes beyond the end of the
596 * snapshot id array, this memcpy() is safe.
597 */
598 memcpy(header->snap_names, &ondisk->snaps[snap_count],
599 snap_names_len);
6a52325f 600
621901d6
AE
601 /* Record each snapshot's size */
602
d2bb24e5
AE
603 size = snap_count * sizeof (*header->snap_sizes);
604 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 605 if (!header->snap_sizes)
6a52325f 606 goto out_err;
621901d6
AE
607 for (i = 0; i < snap_count; i++)
608 header->snap_sizes[i] =
609 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 610 } else {
ccece235 611 WARN_ON(ondisk->snap_names_len);
602adf40
YS
612 header->snap_names = NULL;
613 header->snap_sizes = NULL;
614 }
849b4260 615
34b13184 616 header->features = 0; /* No features support in v1 images */
602adf40
YS
617 header->obj_order = ondisk->options.order;
618 header->crypt_type = ondisk->options.crypt_type;
619 header->comp_type = ondisk->options.comp_type;
6a52325f 620
621901d6
AE
621 /* Allocate and fill in the snapshot context */
622
f84344f3 623 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
624 size = sizeof (struct ceph_snap_context);
625 size += snap_count * sizeof (header->snapc->snaps[0]);
626 header->snapc = kzalloc(size, GFP_KERNEL);
627 if (!header->snapc)
628 goto out_err;
602adf40
YS
629
630 atomic_set(&header->snapc->nref, 1);
505cbb9b 631 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 632 header->snapc->num_snaps = snap_count;
621901d6
AE
633 for (i = 0; i < snap_count; i++)
634 header->snapc->snaps[i] =
635 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
636
637 return 0;
638
6a52325f 639out_err:
849b4260 640 kfree(header->snap_sizes);
ccece235 641 header->snap_sizes = NULL;
602adf40 642 kfree(header->snap_names);
ccece235 643 header->snap_names = NULL;
6a52325f
AE
644 kfree(header->object_prefix);
645 header->object_prefix = NULL;
ccece235 646
00f1f36f 647 return -ENOMEM;
602adf40
YS
648}
649
8836b995 650static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 651{
602adf40 652
e86924a8 653 struct rbd_snap *snap;
602adf40 654
e86924a8
AE
655 list_for_each_entry(snap, &rbd_dev->snaps, node) {
656 if (!strcmp(snap_name, snap->name)) {
971f839a 657 rbd_dev->snap_id = snap->id;
e86924a8 658 rbd_dev->mapping.size = snap->size;
34b13184 659 rbd_dev->mapping.features = snap->features;
602adf40 660
e86924a8 661 return 0;
00f1f36f 662 }
00f1f36f 663 }
e86924a8 664
00f1f36f 665 return -ENOENT;
602adf40
YS
666}
667
819d52bf 668static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 669{
78dc447d 670 int ret;
602adf40 671
819d52bf 672 if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 673 sizeof (RBD_SNAP_HEAD_NAME))) {
971f839a 674 rbd_dev->snap_id = CEPH_NOSNAP;
99c1f08f 675 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 676 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 677 ret = 0;
602adf40 678 } else {
819d52bf 679 ret = snap_by_name(rbd_dev, rbd_dev->snap_name);
602adf40
YS
680 if (ret < 0)
681 goto done;
f84344f3 682 rbd_dev->mapping.read_only = true;
602adf40 683 }
daba5fdb 684 rbd_dev->exists = true;
602adf40 685done:
602adf40
YS
686 return ret;
687}
688
689static void rbd_header_free(struct rbd_image_header *header)
690{
849b4260 691 kfree(header->object_prefix);
d78fd7ae 692 header->object_prefix = NULL;
602adf40 693 kfree(header->snap_sizes);
d78fd7ae 694 header->snap_sizes = NULL;
849b4260 695 kfree(header->snap_names);
d78fd7ae 696 header->snap_names = NULL;
d1d25646 697 ceph_put_snap_context(header->snapc);
d78fd7ae 698 header->snapc = NULL;
602adf40
YS
699}
700
65ccfe21 701static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 702{
65ccfe21
AE
703 char *name;
704 u64 segment;
705 int ret;
602adf40 706
65ccfe21
AE
707 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
708 if (!name)
709 return NULL;
710 segment = offset >> rbd_dev->header.obj_order;
711 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
712 rbd_dev->header.object_prefix, segment);
713 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
714 pr_err("error formatting segment name for #%llu (%d)\n",
715 segment, ret);
716 kfree(name);
717 name = NULL;
718 }
602adf40 719
65ccfe21
AE
720 return name;
721}
602adf40 722
65ccfe21
AE
723static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
724{
725 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 726
65ccfe21
AE
727 return offset & (segment_size - 1);
728}
729
730static u64 rbd_segment_length(struct rbd_device *rbd_dev,
731 u64 offset, u64 length)
732{
733 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
734
735 offset &= segment_size - 1;
736
aafb230e 737 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
738 if (offset + length > segment_size)
739 length = segment_size - offset;
740
741 return length;
602adf40
YS
742}
743
1fec7093
YS
744static int rbd_get_num_segments(struct rbd_image_header *header,
745 u64 ofs, u64 len)
746{
df111be6
AE
747 u64 start_seg;
748 u64 end_seg;
749
750 if (!len)
751 return 0;
752 if (len - 1 > U64_MAX - ofs)
753 return -ERANGE;
754
755 start_seg = ofs >> header->obj_order;
756 end_seg = (ofs + len - 1) >> header->obj_order;
757
1fec7093
YS
758 return end_seg - start_seg + 1;
759}
760
029bcbd8
JD
761/*
762 * returns the size of an object in the image
763 */
764static u64 rbd_obj_bytes(struct rbd_image_header *header)
765{
766 return 1 << header->obj_order;
767}
768
602adf40
YS
769/*
770 * bio helpers
771 */
772
773static void bio_chain_put(struct bio *chain)
774{
775 struct bio *tmp;
776
777 while (chain) {
778 tmp = chain;
779 chain = chain->bi_next;
780 bio_put(tmp);
781 }
782}
783
784/*
785 * zeros a bio chain, starting at specific offset
786 */
787static void zero_bio_chain(struct bio *chain, int start_ofs)
788{
789 struct bio_vec *bv;
790 unsigned long flags;
791 void *buf;
792 int i;
793 int pos = 0;
794
795 while (chain) {
796 bio_for_each_segment(bv, chain, i) {
797 if (pos + bv->bv_len > start_ofs) {
798 int remainder = max(start_ofs - pos, 0);
799 buf = bvec_kmap_irq(bv, &flags);
800 memset(buf + remainder, 0,
801 bv->bv_len - remainder);
85b5aaa6 802 bvec_kunmap_irq(buf, &flags);
602adf40
YS
803 }
804 pos += bv->bv_len;
805 }
806
807 chain = chain->bi_next;
808 }
809}
810
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns the new bio, or NULL on allocation failure or if the
 * requested range does not lie within bio_src (the WARN_ON_ONCE
 * checks below).
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* byte offset into the first cloned bvec */
	unsigned short end_idx;
	unsigned short vcnt;	/* number of bvecs in the clone */
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	/* resid ends up as the used byte count of the final bvec */
	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* First and last bvec are the same one */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
891
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* Clone at most to the end of the current source bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Source bio exhausted; advance to the next one */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Undo any partial chain built so far */
	bio_chain_put(chain);

	return NULL;
}
952
953/*
954 * helpers for osd request op vectors.
955 */
57cfc106
AE
956static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
957 int opcode, u32 payload_len)
602adf40 958{
57cfc106
AE
959 struct ceph_osd_req_op *ops;
960
961 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
962 if (!ops)
963 return NULL;
964
965 ops[0].op = opcode;
966
602adf40
YS
967 /*
968 * op extent offset and length will be set later on
969 * in calc_raw_layout()
970 */
57cfc106
AE
971 ops[0].payload_len = payload_len;
972
973 return ops;
602adf40
YS
974}
975
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
980
1fec7093
YS
/*
 * Record completion status for entry @index of a request collection
 * and complete, in order, the prefix of collection entries that are
 * now all done (each via __blk_end_request() under the queue lock).
 * With no collection the whole block request is ended directly.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		/* Not part of a collection: complete the request directly */
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	/* Scan forward from num_done over the contiguous finished entries */
	max = min = coll->num_done;
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* Drop the reference held for this collection entry */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1018
/*
 * Complete the portion of the original block request covered by
 * @req, using its collection bookkeeping (if any).
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1024
602adf40
YS
/*
 * Send ceph osd request
 *
 * Builds and submits one OSD request for @object_name covering
 * [ofs, ofs + len).  The data payload is carried by @bio and/or
 * @pages (both are handed to the osd client).  If @rbd_cb is
 * non-NULL the request completes asynchronously via that callback;
 * otherwise this waits for completion and drops the request
 * reference itself.  If @linger_req is non-NULL the request is set
 * up to linger and is returned through it (used for header watch).
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Still account this entry's failure to the collection */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	/* Context recovered by the completion callback via r_priv */
	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* One object per file: stripe unit == object size, count 1 */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait here and drop our request reference */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
			le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1138
1139/*
1140 * Ceph osd op callback
1141 */
1142static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1143{
1144 struct rbd_request *req_data = req->r_priv;
1145 struct ceph_osd_reply_head *replyhead;
1146 struct ceph_osd_op *op;
1147 __s32 rc;
1148 u64 bytes;
1149 int read_op;
1150
1151 /* parse reply */
1152 replyhead = msg->front.iov_base;
1153 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1154 op = (void *)(replyhead + 1);
1155 rc = le32_to_cpu(replyhead->result);
1156 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1157 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1158
bd919d45
AE
1159 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1160 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1161
1162 if (rc == -ENOENT && read_op) {
1163 zero_bio_chain(req_data->bio, 0);
1164 rc = 0;
1165 } else if (rc == 0 && read_op && bytes < req_data->len) {
1166 zero_bio_chain(req_data->bio, bytes);
1167 bytes = req_data->len;
1168 }
1169
1fec7093 1170 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1171
1172 if (req_data->bio)
1173 bio_chain_put(req_data->bio);
1174
1175 ceph_osdc_put_request(req);
1176 kfree(req_data);
1177}
1178
59c2be1e
YS
/*
 * Completion callback for requests that need no result processing:
 * just drop the osd request reference.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
        ceph_osdc_put_request(req);
}
1183
602adf40
YS
1184/*
1185 * Do a synchronous ceph osd operation
1186 */
0ce1a794 1187static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1188 struct ceph_snap_context *snapc,
1189 u64 snapid,
602adf40 1190 int flags,
913d2fdc 1191 struct ceph_osd_req_op *ops,
aded07ea 1192 const char *object_name,
f8d4de6e
AE
1193 u64 ofs, u64 inbound_size,
1194 char *inbound,
59c2be1e
YS
1195 struct ceph_osd_request **linger_req,
1196 u64 *ver)
602adf40
YS
1197{
1198 int ret;
1199 struct page **pages;
1200 int num_pages;
913d2fdc 1201
aafb230e 1202 rbd_assert(ops != NULL);
602adf40 1203
f8d4de6e 1204 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1205 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1206 if (IS_ERR(pages))
1207 return PTR_ERR(pages);
602adf40 1208
0ce1a794 1209 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1210 object_name, ofs, inbound_size, NULL,
602adf40
YS
1211 pages, num_pages,
1212 flags,
1213 ops,
1fec7093 1214 NULL, 0,
59c2be1e
YS
1215 NULL,
1216 linger_req, ver);
602adf40 1217 if (ret < 0)
913d2fdc 1218 goto done;
602adf40 1219
f8d4de6e
AE
1220 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1221 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1222
602adf40
YS
1223done:
1224 ceph_release_page_vector(pages, num_pages);
1225 return ret;
1226}
1227
1228/*
1229 * Do an asynchronous ceph osd operation
1230 */
1231static int rbd_do_op(struct request *rq,
0ce1a794 1232 struct rbd_device *rbd_dev,
602adf40 1233 struct ceph_snap_context *snapc,
602adf40 1234 u64 ofs, u64 len,
1fec7093
YS
1235 struct bio *bio,
1236 struct rbd_req_coll *coll,
1237 int coll_index)
602adf40
YS
1238{
1239 char *seg_name;
1240 u64 seg_ofs;
1241 u64 seg_len;
1242 int ret;
1243 struct ceph_osd_req_op *ops;
1244 u32 payload_len;
ff2e4bb5
AE
1245 int opcode;
1246 int flags;
4634246d 1247 u64 snapid;
602adf40 1248
65ccfe21 1249 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1250 if (!seg_name)
1251 return -ENOMEM;
65ccfe21
AE
1252 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1253 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40 1254
ff2e4bb5
AE
1255 if (rq_data_dir(rq) == WRITE) {
1256 opcode = CEPH_OSD_OP_WRITE;
1257 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
4634246d 1258 snapid = CEPH_NOSNAP;
ff2e4bb5
AE
1259 payload_len = seg_len;
1260 } else {
1261 opcode = CEPH_OSD_OP_READ;
1262 flags = CEPH_OSD_FLAG_READ;
4634246d 1263 snapc = NULL;
971f839a 1264 snapid = rbd_dev->snap_id;
ff2e4bb5
AE
1265 payload_len = 0;
1266 }
602adf40 1267
57cfc106
AE
1268 ret = -ENOMEM;
1269 ops = rbd_create_rw_ops(1, opcode, payload_len);
1270 if (!ops)
602adf40
YS
1271 goto done;
1272
1273 /* we've taken care of segment sizes earlier when we
1274 cloned the bios. We should never have a segment
1275 truncated at this point */
aafb230e 1276 rbd_assert(seg_len == len);
602adf40
YS
1277
1278 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1279 seg_name, seg_ofs, seg_len,
1280 bio,
1281 NULL, 0,
1282 flags,
1283 ops,
1fec7093 1284 coll, coll_index,
59c2be1e 1285 rbd_req_cb, 0, NULL);
11f77002
SW
1286
1287 rbd_destroy_ops(ops);
602adf40
YS
1288done:
1289 kfree(seg_name);
1290 return ret;
1291}
1292
602adf40
YS
1293/*
1294 * Request sync osd read
1295 */
0ce1a794 1296static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1297 u64 snapid,
aded07ea 1298 const char *object_name,
602adf40 1299 u64 ofs, u64 len,
59c2be1e
YS
1300 char *buf,
1301 u64 *ver)
602adf40 1302{
913d2fdc
AE
1303 struct ceph_osd_req_op *ops;
1304 int ret;
1305
1306 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1307 if (!ops)
1308 return -ENOMEM;
1309
1310 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1311 snapid,
602adf40 1312 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1313 ops, object_name, ofs, len, buf, NULL, ver);
1314 rbd_destroy_ops(ops);
1315
1316 return ret;
602adf40
YS
1317}
1318
1319/*
59c2be1e
YS
1320 * Request sync osd watch
1321 */
0ce1a794 1322static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1323 u64 ver,
7f0a24d8 1324 u64 notify_id)
59c2be1e
YS
1325{
1326 struct ceph_osd_req_op *ops;
11f77002
SW
1327 int ret;
1328
57cfc106
AE
1329 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1330 if (!ops)
1331 return -ENOMEM;
59c2be1e 1332
a71b891b 1333 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1334 ops[0].watch.cookie = notify_id;
1335 ops[0].watch.flag = 0;
1336
0ce1a794 1337 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1338 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1339 NULL, 0,
59c2be1e
YS
1340 CEPH_OSD_FLAG_READ,
1341 ops,
1fec7093 1342 NULL, 0,
59c2be1e
YS
1343 rbd_simple_req_cb, 0, NULL);
1344
1345 rbd_destroy_ops(ops);
1346 return ret;
1347}
1348
1349static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1350{
0ce1a794 1351 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1352 u64 hver;
13143d2d
SW
1353 int rc;
1354
0ce1a794 1355 if (!rbd_dev)
59c2be1e
YS
1356 return;
1357
bd919d45
AE
1358 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1359 rbd_dev->header_name, (unsigned long long) notify_id,
1360 (unsigned int) opcode);
117973fb 1361 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1362 if (rc)
f0f8cef5 1363 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1364 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1365
7f0a24d8 1366 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1367}
1368
1369/*
1370 * Request sync osd watch
1371 */
0e6f322d 1372static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1373{
1374 struct ceph_osd_req_op *ops;
0ce1a794 1375 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1376 int ret;
59c2be1e 1377
57cfc106
AE
1378 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1379 if (!ops)
1380 return -ENOMEM;
59c2be1e
YS
1381
1382 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1383 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1384 if (ret < 0)
1385 goto fail;
1386
0e6f322d 1387 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1388 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1389 ops[0].watch.flag = 1;
1390
0ce1a794 1391 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1392 CEPH_NOSNAP,
59c2be1e
YS
1393 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1394 ops,
0e6f322d
AE
1395 rbd_dev->header_name,
1396 0, 0, NULL,
0ce1a794 1397 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1398
1399 if (ret < 0)
1400 goto fail_event;
1401
1402 rbd_destroy_ops(ops);
1403 return 0;
1404
1405fail_event:
0ce1a794
AE
1406 ceph_osdc_cancel_event(rbd_dev->watch_event);
1407 rbd_dev->watch_event = NULL;
59c2be1e
YS
1408fail:
1409 rbd_destroy_ops(ops);
1410 return ret;
1411}
1412
79e3057c
YS
1413/*
1414 * Request sync osd unwatch
1415 */
070c633f 1416static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1417{
1418 struct ceph_osd_req_op *ops;
57cfc106 1419 int ret;
79e3057c 1420
57cfc106
AE
1421 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1422 if (!ops)
1423 return -ENOMEM;
79e3057c
YS
1424
1425 ops[0].watch.ver = 0;
0ce1a794 1426 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1427 ops[0].watch.flag = 0;
1428
0ce1a794 1429 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1430 CEPH_NOSNAP,
79e3057c
YS
1431 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1432 ops,
070c633f
AE
1433 rbd_dev->header_name,
1434 0, 0, NULL, NULL, NULL);
1435
79e3057c
YS
1436
1437 rbd_destroy_ops(ops);
0ce1a794
AE
1438 ceph_osdc_cancel_event(rbd_dev->watch_event);
1439 rbd_dev->watch_event = NULL;
79e3057c
YS
1440 return ret;
1441}
1442
602adf40 1443/*
3cb4a687 1444 * Synchronous osd object method call
602adf40 1445 */
0ce1a794 1446static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1447 const char *object_name,
1448 const char *class_name,
1449 const char *method_name,
3cb4a687
AE
1450 const char *outbound,
1451 size_t outbound_size,
f8d4de6e
AE
1452 char *inbound,
1453 size_t inbound_size,
3cb4a687 1454 int flags,
59c2be1e 1455 u64 *ver)
602adf40
YS
1456{
1457 struct ceph_osd_req_op *ops;
aded07ea
AE
1458 int class_name_len = strlen(class_name);
1459 int method_name_len = strlen(method_name);
3cb4a687 1460 int payload_size;
57cfc106
AE
1461 int ret;
1462
3cb4a687
AE
1463 /*
1464 * Any input parameters required by the method we're calling
1465 * will be sent along with the class and method names as
1466 * part of the message payload. That data and its size are
1467 * supplied via the indata and indata_len fields (named from
1468 * the perspective of the server side) in the OSD request
1469 * operation.
1470 */
1471 payload_size = class_name_len + method_name_len + outbound_size;
1472 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1473 if (!ops)
1474 return -ENOMEM;
602adf40 1475
aded07ea
AE
1476 ops[0].cls.class_name = class_name;
1477 ops[0].cls.class_len = (__u8) class_name_len;
1478 ops[0].cls.method_name = method_name;
1479 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1480 ops[0].cls.argc = 0;
3cb4a687
AE
1481 ops[0].cls.indata = outbound;
1482 ops[0].cls.indata_len = outbound_size;
602adf40 1483
0ce1a794 1484 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1485 CEPH_NOSNAP,
3cb4a687 1486 flags, ops,
f8d4de6e
AE
1487 object_name, 0, inbound_size, inbound,
1488 NULL, ver);
602adf40
YS
1489
1490 rbd_destroy_ops(ops);
1491
1492 dout("cls_exec returned %d\n", ret);
1493 return ret;
1494}
1495
1fec7093
YS
1496static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1497{
1498 struct rbd_req_coll *coll =
1499 kzalloc(sizeof(struct rbd_req_coll) +
1500 sizeof(struct rbd_req_status) * num_reqs,
1501 GFP_ATOMIC);
1502
1503 if (!coll)
1504 return NULL;
1505 coll->total = num_reqs;
1506 kref_init(&coll->kref);
1507 return coll;
1508}
1509
602adf40
YS
1510/*
1511 * block device queue callback
1512 */
1513static void rbd_rq_fn(struct request_queue *q)
1514{
1515 struct rbd_device *rbd_dev = q->queuedata;
1516 struct request *rq;
602adf40 1517
00f1f36f 1518 while ((rq = blk_fetch_request(q))) {
602adf40 1519 struct bio *bio;
602adf40 1520 bool do_write;
bd919d45 1521 unsigned int size;
602adf40 1522 u64 ofs;
1fec7093
YS
1523 int num_segs, cur_seg = 0;
1524 struct rbd_req_coll *coll;
d1d25646 1525 struct ceph_snap_context *snapc;
f7760dad 1526 unsigned int bio_offset;
602adf40 1527
602adf40
YS
1528 dout("fetched request\n");
1529
1530 /* filter out block requests we don't understand */
1531 if ((rq->cmd_type != REQ_TYPE_FS)) {
1532 __blk_end_request_all(rq, 0);
00f1f36f 1533 continue;
602adf40
YS
1534 }
1535
1536 /* deduce our operation (read, write) */
1537 do_write = (rq_data_dir(rq) == WRITE);
f84344f3 1538 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1539 __blk_end_request_all(rq, -EROFS);
00f1f36f 1540 continue;
602adf40
YS
1541 }
1542
1543 spin_unlock_irq(q->queue_lock);
1544
d1d25646 1545 down_read(&rbd_dev->header_rwsem);
e88a36ec 1546
daba5fdb
AE
1547 if (!rbd_dev->exists) {
1548 rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP);
e88a36ec 1549 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1550 dout("request for non-existent snapshot");
1551 spin_lock_irq(q->queue_lock);
1552 __blk_end_request_all(rq, -ENXIO);
1553 continue;
e88a36ec
JD
1554 }
1555
d1d25646
JD
1556 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1557
1558 up_read(&rbd_dev->header_rwsem);
1559
f7760dad
AE
1560 size = blk_rq_bytes(rq);
1561 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1562 bio = rq->bio;
1563
602adf40
YS
1564 dout("%s 0x%x bytes at 0x%llx\n",
1565 do_write ? "write" : "read",
bd919d45 1566 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1567
1fec7093 1568 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1569 if (num_segs <= 0) {
1570 spin_lock_irq(q->queue_lock);
1571 __blk_end_request_all(rq, num_segs);
1572 ceph_put_snap_context(snapc);
1573 continue;
1574 }
1fec7093
YS
1575 coll = rbd_alloc_coll(num_segs);
1576 if (!coll) {
1577 spin_lock_irq(q->queue_lock);
1578 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1579 ceph_put_snap_context(snapc);
00f1f36f 1580 continue;
1fec7093
YS
1581 }
1582
f7760dad 1583 bio_offset = 0;
602adf40 1584 do {
f7760dad
AE
1585 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1586 unsigned int chain_size;
1587 struct bio *bio_chain;
1588
1589 BUG_ON(limit > (u64) UINT_MAX);
1590 chain_size = (unsigned int) limit;
bd919d45 1591 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
f7760dad 1592
1fec7093 1593 kref_get(&coll->kref);
f7760dad
AE
1594
1595 /* Pass a cloned bio chain via an osd request */
1596
1597 bio_chain = bio_chain_clone_range(&bio,
1598 &bio_offset, chain_size,
1599 GFP_ATOMIC);
1600 if (bio_chain)
4634246d 1601 (void) rbd_do_op(rq, rbd_dev, snapc,
f7760dad
AE
1602 ofs, chain_size,
1603 bio_chain, coll, cur_seg);
4634246d 1604 else
1fec7093 1605 rbd_coll_end_req_index(rq, coll, cur_seg,
f7760dad
AE
1606 -ENOMEM, chain_size);
1607 size -= chain_size;
1608 ofs += chain_size;
602adf40 1609
1fec7093 1610 cur_seg++;
602adf40 1611 } while (size > 0);
1fec7093 1612 kref_put(&coll->kref, rbd_coll_release);
602adf40 1613
602adf40 1614 spin_lock_irq(q->queue_lock);
d1d25646
JD
1615
1616 ceph_put_snap_context(snapc);
602adf40
YS
1617 }
1618}
1619
1620/*
1621 * a queue callback. Makes sure that we don't create a bio that spans across
1622 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1623 * which we handle later at bio_chain_clone_range()
602adf40
YS
1624 */
1625static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1626 struct bio_vec *bvec)
1627{
1628 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
1629 sector_t sector_offset;
1630 sector_t sectors_per_obj;
1631 sector_t obj_sector_offset;
1632 int ret;
1633
1634 /*
1635 * Find how far into its rbd object the partition-relative
1636 * bio start sector is to offset relative to the enclosing
1637 * device.
1638 */
1639 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1640 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1641 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1642
1643 /*
1644 * Compute the number of bytes from that offset to the end
1645 * of the object. Account for what's already used by the bio.
1646 */
1647 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1648 if (ret > bmd->bi_size)
1649 ret -= bmd->bi_size;
1650 else
1651 ret = 0;
1652
1653 /*
1654 * Don't send back more than was asked for. And if the bio
1655 * was empty, let the whole thing through because: "Note
1656 * that a block device *must* allow a single page to be
1657 * added to an empty bio."
1658 */
1659 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1660 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1661 ret = (int) bvec->bv_len;
1662
1663 return ret;
602adf40
YS
1664}
1665
1666static void rbd_free_disk(struct rbd_device *rbd_dev)
1667{
1668 struct gendisk *disk = rbd_dev->disk;
1669
1670 if (!disk)
1671 return;
1672
602adf40
YS
1673 if (disk->flags & GENHD_FL_UP)
1674 del_gendisk(disk);
1675 if (disk->queue)
1676 blk_cleanup_queue(disk->queue);
1677 put_disk(disk);
1678}
1679
1680/*
4156d998
AE
1681 * Read the complete header for the given rbd device.
1682 *
1683 * Returns a pointer to a dynamically-allocated buffer containing
1684 * the complete and validated header. Caller can pass the address
1685 * of a variable that will be filled in with the version of the
1686 * header object at the time it was read.
1687 *
1688 * Returns a pointer-coded errno if a failure occurs.
602adf40 1689 */
4156d998
AE
1690static struct rbd_image_header_ondisk *
1691rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1692{
4156d998 1693 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1694 u32 snap_count = 0;
4156d998
AE
1695 u64 names_size = 0;
1696 u32 want_count;
1697 int ret;
602adf40 1698
00f1f36f 1699 /*
4156d998
AE
1700 * The complete header will include an array of its 64-bit
1701 * snapshot ids, followed by the names of those snapshots as
1702 * a contiguous block of NUL-terminated strings. Note that
1703 * the number of snapshots could change by the time we read
1704 * it in, in which case we re-read it.
00f1f36f 1705 */
4156d998
AE
1706 do {
1707 size_t size;
1708
1709 kfree(ondisk);
1710
1711 size = sizeof (*ondisk);
1712 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1713 size += names_size;
1714 ondisk = kmalloc(size, GFP_KERNEL);
1715 if (!ondisk)
1716 return ERR_PTR(-ENOMEM);
1717
1718 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1719 rbd_dev->header_name,
4156d998
AE
1720 0, size,
1721 (char *) ondisk, version);
1722
1723 if (ret < 0)
1724 goto out_err;
1725 if (WARN_ON((size_t) ret < size)) {
1726 ret = -ENXIO;
1727 pr_warning("short header read for image %s"
1728 " (want %zd got %d)\n",
1729 rbd_dev->image_name, size, ret);
1730 goto out_err;
1731 }
1732 if (!rbd_dev_ondisk_valid(ondisk)) {
1733 ret = -ENXIO;
1734 pr_warning("invalid header for image %s\n",
1735 rbd_dev->image_name);
1736 goto out_err;
81e759fb 1737 }
602adf40 1738
4156d998
AE
1739 names_size = le64_to_cpu(ondisk->snap_names_len);
1740 want_count = snap_count;
1741 snap_count = le32_to_cpu(ondisk->snap_count);
1742 } while (snap_count != want_count);
00f1f36f 1743
4156d998 1744 return ondisk;
00f1f36f 1745
4156d998
AE
1746out_err:
1747 kfree(ondisk);
1748
1749 return ERR_PTR(ret);
1750}
1751
1752/*
1753 * reload the ondisk the header
1754 */
1755static int rbd_read_header(struct rbd_device *rbd_dev,
1756 struct rbd_image_header *header)
1757{
1758 struct rbd_image_header_ondisk *ondisk;
1759 u64 ver = 0;
1760 int ret;
602adf40 1761
4156d998
AE
1762 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1763 if (IS_ERR(ondisk))
1764 return PTR_ERR(ondisk);
1765 ret = rbd_header_from_disk(header, ondisk);
1766 if (ret >= 0)
1767 header->obj_version = ver;
1768 kfree(ondisk);
1769
1770 return ret;
602adf40
YS
1771}
1772
41f38c2b 1773static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1774{
1775 struct rbd_snap *snap;
a0593290 1776 struct rbd_snap *next;
dfc5606d 1777
a0593290 1778 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1779 rbd_remove_snap_dev(snap);
dfc5606d
YS
1780}
1781
9478554a
AE
1782static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1783{
1784 sector_t size;
1785
971f839a 1786 if (rbd_dev->snap_id != CEPH_NOSNAP)
9478554a
AE
1787 return;
1788
1789 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1790 dout("setting size to %llu sectors", (unsigned long long) size);
1791 rbd_dev->mapping.size = (u64) size;
1792 set_capacity(rbd_dev->disk, size);
1793}
1794
602adf40
YS
1795/*
1796 * only read the first part of the ondisk header, without the snaps info
1797 */
117973fb 1798static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1799{
1800 int ret;
1801 struct rbd_image_header h;
602adf40
YS
1802
1803 ret = rbd_read_header(rbd_dev, &h);
1804 if (ret < 0)
1805 return ret;
1806
a51aa0c0
JD
1807 down_write(&rbd_dev->header_rwsem);
1808
9478554a
AE
1809 /* Update image size, and check for resize of mapped image */
1810 rbd_dev->header.image_size = h.image_size;
1811 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1812
849b4260 1813 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1814 kfree(rbd_dev->header.snap_sizes);
849b4260 1815 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1816 /* osd requests may still refer to snapc */
1817 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1818
b813623a
AE
1819 if (hver)
1820 *hver = h.obj_version;
a71b891b 1821 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1822 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1823 rbd_dev->header.snapc = h.snapc;
1824 rbd_dev->header.snap_names = h.snap_names;
1825 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1826 /* Free the extra copy of the object prefix */
1827 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1828 kfree(h.object_prefix);
1829
304f6808
AE
1830 ret = rbd_dev_snaps_update(rbd_dev);
1831 if (!ret)
1832 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1833
c666601a 1834 up_write(&rbd_dev->header_rwsem);
602adf40 1835
dfc5606d 1836 return ret;
602adf40
YS
1837}
1838
117973fb 1839static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1840{
1841 int ret;
1842
117973fb 1843 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1844 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1845 if (rbd_dev->image_format == 1)
1846 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1847 else
1848 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1849 mutex_unlock(&ctl_mutex);
1850
1851 return ret;
1852}
1853
602adf40
YS
1854static int rbd_init_disk(struct rbd_device *rbd_dev)
1855{
1856 struct gendisk *disk;
1857 struct request_queue *q;
593a9e7b 1858 u64 segment_size;
602adf40 1859
602adf40 1860 /* create gendisk info */
602adf40
YS
1861 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1862 if (!disk)
1fcdb8aa 1863 return -ENOMEM;
602adf40 1864
f0f8cef5 1865 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1866 rbd_dev->dev_id);
602adf40
YS
1867 disk->major = rbd_dev->major;
1868 disk->first_minor = 0;
1869 disk->fops = &rbd_bd_ops;
1870 disk->private_data = rbd_dev;
1871
1872 /* init rq */
602adf40
YS
1873 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1874 if (!q)
1875 goto out_disk;
029bcbd8 1876
593a9e7b
AE
1877 /* We use the default size, but let's be explicit about it. */
1878 blk_queue_physical_block_size(q, SECTOR_SIZE);
1879
029bcbd8 1880 /* set io sizes to object size */
593a9e7b
AE
1881 segment_size = rbd_obj_bytes(&rbd_dev->header);
1882 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1883 blk_queue_max_segment_size(q, segment_size);
1884 blk_queue_io_min(q, segment_size);
1885 blk_queue_io_opt(q, segment_size);
029bcbd8 1886
602adf40
YS
1887 blk_queue_merge_bvec(q, rbd_merge_bvec);
1888 disk->queue = q;
1889
1890 q->queuedata = rbd_dev;
1891
1892 rbd_dev->disk = disk;
602adf40 1893
12f02944
AE
1894 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1895
602adf40 1896 return 0;
602adf40
YS
1897out_disk:
1898 put_disk(disk);
1fcdb8aa
AE
1899
1900 return -ENOMEM;
602adf40
YS
1901}
1902
dfc5606d
YS
1903/*
1904 sysfs
1905*/
1906
593a9e7b
AE
1907static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1908{
1909 return container_of(dev, struct rbd_device, dev);
1910}
1911
dfc5606d
YS
1912static ssize_t rbd_size_show(struct device *dev,
1913 struct device_attribute *attr, char *buf)
1914{
593a9e7b 1915 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1916 sector_t size;
1917
1918 down_read(&rbd_dev->header_rwsem);
1919 size = get_capacity(rbd_dev->disk);
1920 up_read(&rbd_dev->header_rwsem);
dfc5606d 1921
a51aa0c0 1922 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1923}
1924
34b13184
AE
1925/*
1926 * Note this shows the features for whatever's mapped, which is not
1927 * necessarily the base image.
1928 */
1929static ssize_t rbd_features_show(struct device *dev,
1930 struct device_attribute *attr, char *buf)
1931{
1932 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1933
1934 return sprintf(buf, "0x%016llx\n",
1935 (unsigned long long) rbd_dev->mapping.features);
1936}
1937
dfc5606d
YS
1938static ssize_t rbd_major_show(struct device *dev,
1939 struct device_attribute *attr, char *buf)
1940{
593a9e7b 1941 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1942
dfc5606d
YS
1943 return sprintf(buf, "%d\n", rbd_dev->major);
1944}
1945
1946static ssize_t rbd_client_id_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
602adf40 1948{
593a9e7b 1949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1950
1dbb4399
AE
1951 return sprintf(buf, "client%lld\n",
1952 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1953}
1954
dfc5606d
YS
1955static ssize_t rbd_pool_show(struct device *dev,
1956 struct device_attribute *attr, char *buf)
602adf40 1957{
593a9e7b 1958 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1959
1960 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1961}
1962
9bb2f334
AE
1963static ssize_t rbd_pool_id_show(struct device *dev,
1964 struct device_attribute *attr, char *buf)
1965{
1966 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1967
86992098 1968 return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id);
9bb2f334
AE
1969}
1970
dfc5606d
YS
1971static ssize_t rbd_name_show(struct device *dev,
1972 struct device_attribute *attr, char *buf)
1973{
593a9e7b 1974 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1975
0bed54dc 1976 return sprintf(buf, "%s\n", rbd_dev->image_name);
dfc5606d
YS
1977}
1978
589d30e0
AE
1979static ssize_t rbd_image_id_show(struct device *dev,
1980 struct device_attribute *attr, char *buf)
1981{
1982 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1983
1984 return sprintf(buf, "%s\n", rbd_dev->image_id);
1985}
1986
34b13184
AE
1987/*
1988 * Shows the name of the currently-mapped snapshot (or
1989 * RBD_SNAP_HEAD_NAME for the base image).
1990 */
dfc5606d
YS
1991static ssize_t rbd_snap_show(struct device *dev,
1992 struct device_attribute *attr,
1993 char *buf)
1994{
593a9e7b 1995 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1996
971f839a 1997 return sprintf(buf, "%s\n", rbd_dev->snap_name);
dfc5606d
YS
1998}
1999
2000static ssize_t rbd_image_refresh(struct device *dev,
2001 struct device_attribute *attr,
2002 const char *buf,
2003 size_t size)
2004{
593a9e7b 2005 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2006 int ret;
602adf40 2007
117973fb 2008 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2009
2010 return ret < 0 ? ret : size;
dfc5606d 2011}
602adf40 2012
dfc5606d 2013static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2014static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2015static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2016static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2017static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2018static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2019static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2020static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2021static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2022static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
dfc5606d
YS
2023
2024static struct attribute *rbd_attrs[] = {
2025 &dev_attr_size.attr,
34b13184 2026 &dev_attr_features.attr,
dfc5606d
YS
2027 &dev_attr_major.attr,
2028 &dev_attr_client_id.attr,
2029 &dev_attr_pool.attr,
9bb2f334 2030 &dev_attr_pool_id.attr,
dfc5606d 2031 &dev_attr_name.attr,
589d30e0 2032 &dev_attr_image_id.attr,
dfc5606d
YS
2033 &dev_attr_current_snap.attr,
2034 &dev_attr_refresh.attr,
dfc5606d
YS
2035 NULL
2036};
2037
2038static struct attribute_group rbd_attr_group = {
2039 .attrs = rbd_attrs,
2040};
2041
2042static const struct attribute_group *rbd_attr_groups[] = {
2043 &rbd_attr_group,
2044 NULL
2045};
2046
2047static void rbd_sysfs_dev_release(struct device *dev)
2048{
2049}
2050
2051static struct device_type rbd_device_type = {
2052 .name = "rbd",
2053 .groups = rbd_attr_groups,
2054 .release = rbd_sysfs_dev_release,
2055};
2056
2057
2058/*
2059 sysfs - snapshots
2060*/
2061
2062static ssize_t rbd_snap_size_show(struct device *dev,
2063 struct device_attribute *attr,
2064 char *buf)
2065{
2066 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2067
3591538f 2068 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2069}
2070
2071static ssize_t rbd_snap_id_show(struct device *dev,
2072 struct device_attribute *attr,
2073 char *buf)
2074{
2075 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2076
3591538f 2077 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2078}
2079
34b13184
AE
2080static ssize_t rbd_snap_features_show(struct device *dev,
2081 struct device_attribute *attr,
2082 char *buf)
2083{
2084 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2085
2086 return sprintf(buf, "0x%016llx\n",
2087 (unsigned long long) snap->features);
2088}
2089
dfc5606d
YS
2090static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2091static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2092static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2093
2094static struct attribute *rbd_snap_attrs[] = {
2095 &dev_attr_snap_size.attr,
2096 &dev_attr_snap_id.attr,
34b13184 2097 &dev_attr_snap_features.attr,
dfc5606d
YS
2098 NULL,
2099};
2100
2101static struct attribute_group rbd_snap_attr_group = {
2102 .attrs = rbd_snap_attrs,
2103};
2104
2105static void rbd_snap_dev_release(struct device *dev)
2106{
2107 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2108 kfree(snap->name);
2109 kfree(snap);
2110}
2111
2112static const struct attribute_group *rbd_snap_attr_groups[] = {
2113 &rbd_snap_attr_group,
2114 NULL
2115};
2116
2117static struct device_type rbd_snap_device_type = {
2118 .groups = rbd_snap_attr_groups,
2119 .release = rbd_snap_dev_release,
2120};
2121
304f6808
AE
2122static bool rbd_snap_registered(struct rbd_snap *snap)
2123{
2124 bool ret = snap->dev.type == &rbd_snap_device_type;
2125 bool reg = device_is_registered(&snap->dev);
2126
2127 rbd_assert(!ret ^ reg);
2128
2129 return ret;
2130}
2131
41f38c2b 2132static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2133{
2134 list_del(&snap->node);
304f6808
AE
2135 if (device_is_registered(&snap->dev))
2136 device_unregister(&snap->dev);
dfc5606d
YS
2137}
2138
14e7085d 2139static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2140 struct device *parent)
2141{
2142 struct device *dev = &snap->dev;
2143 int ret;
2144
2145 dev->type = &rbd_snap_device_type;
2146 dev->parent = parent;
2147 dev->release = rbd_snap_dev_release;
d4b125e9 2148 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2149 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2150
dfc5606d
YS
2151 ret = device_register(dev);
2152
2153 return ret;
2154}
2155
4e891e0a 2156static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2157 const char *snap_name,
34b13184
AE
2158 u64 snap_id, u64 snap_size,
2159 u64 snap_features)
dfc5606d 2160{
4e891e0a 2161 struct rbd_snap *snap;
dfc5606d 2162 int ret;
4e891e0a
AE
2163
2164 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2165 if (!snap)
4e891e0a
AE
2166 return ERR_PTR(-ENOMEM);
2167
2168 ret = -ENOMEM;
c8d18425 2169 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2170 if (!snap->name)
2171 goto err;
2172
c8d18425
AE
2173 snap->id = snap_id;
2174 snap->size = snap_size;
34b13184 2175 snap->features = snap_features;
4e891e0a
AE
2176
2177 return snap;
2178
dfc5606d
YS
2179err:
2180 kfree(snap->name);
2181 kfree(snap);
4e891e0a
AE
2182
2183 return ERR_PTR(ret);
dfc5606d
YS
2184}
2185
cd892126
AE
2186static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2187 u64 *snap_size, u64 *snap_features)
2188{
2189 char *snap_name;
2190
2191 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2192
2193 *snap_size = rbd_dev->header.snap_sizes[which];
2194 *snap_features = 0; /* No features for v1 */
2195
2196 /* Skip over names until we find the one we are looking for */
2197
2198 snap_name = rbd_dev->header.snap_names;
2199 while (which--)
2200 snap_name += strlen(snap_name) + 1;
2201
2202 return snap_name;
2203}
2204
9d475de5
AE
2205/*
2206 * Get the size and object order for an image snapshot, or if
2207 * snap_id is CEPH_NOSNAP, gets this information for the base
2208 * image.
2209 */
2210static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2211 u8 *order, u64 *snap_size)
2212{
2213 __le64 snapid = cpu_to_le64(snap_id);
2214 int ret;
2215 struct {
2216 u8 order;
2217 __le64 size;
2218 } __attribute__ ((packed)) size_buf = { 0 };
2219
2220 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2221 "rbd", "get_size",
2222 (char *) &snapid, sizeof (snapid),
2223 (char *) &size_buf, sizeof (size_buf),
2224 CEPH_OSD_FLAG_READ, NULL);
2225 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2226 if (ret < 0)
2227 return ret;
2228
2229 *order = size_buf.order;
2230 *snap_size = le64_to_cpu(size_buf.size);
2231
2232 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2233 (unsigned long long) snap_id, (unsigned int) *order,
2234 (unsigned long long) *snap_size);
2235
2236 return 0;
2237}
2238
2239static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2240{
2241 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2242 &rbd_dev->header.obj_order,
2243 &rbd_dev->header.image_size);
2244}
2245
1e130199
AE
2246static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2247{
2248 void *reply_buf;
2249 int ret;
2250 void *p;
2251
2252 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2253 if (!reply_buf)
2254 return -ENOMEM;
2255
2256 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2257 "rbd", "get_object_prefix",
2258 NULL, 0,
2259 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2260 CEPH_OSD_FLAG_READ, NULL);
2261 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2262 if (ret < 0)
2263 goto out;
a0ea3a40 2264 ret = 0; /* rbd_req_sync_exec() can return positive */
1e130199
AE
2265
2266 p = reply_buf;
2267 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2268 p + RBD_OBJ_PREFIX_LEN_MAX,
2269 NULL, GFP_NOIO);
2270
2271 if (IS_ERR(rbd_dev->header.object_prefix)) {
2272 ret = PTR_ERR(rbd_dev->header.object_prefix);
2273 rbd_dev->header.object_prefix = NULL;
2274 } else {
2275 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2276 }
2277
2278out:
2279 kfree(reply_buf);
2280
2281 return ret;
2282}
2283
b1b5402a
AE
2284static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2285 u64 *snap_features)
2286{
2287 __le64 snapid = cpu_to_le64(snap_id);
2288 struct {
2289 __le64 features;
2290 __le64 incompat;
2291 } features_buf = { 0 };
d889140c 2292 u64 incompat;
b1b5402a
AE
2293 int ret;
2294
2295 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2296 "rbd", "get_features",
2297 (char *) &snapid, sizeof (snapid),
2298 (char *) &features_buf, sizeof (features_buf),
2299 CEPH_OSD_FLAG_READ, NULL);
2300 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2301 if (ret < 0)
2302 return ret;
d889140c
AE
2303
2304 incompat = le64_to_cpu(features_buf.incompat);
2305 if (incompat & ~RBD_FEATURES_ALL)
2306 return -ENOTSUPP;
2307
b1b5402a
AE
2308 *snap_features = le64_to_cpu(features_buf.features);
2309
2310 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2311 (unsigned long long) snap_id,
2312 (unsigned long long) *snap_features,
2313 (unsigned long long) le64_to_cpu(features_buf.incompat));
2314
2315 return 0;
2316}
2317
2318static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2319{
2320 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2321 &rbd_dev->header.features);
2322}
2323
6e14b1a6 2324static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2325{
2326 size_t size;
2327 int ret;
2328 void *reply_buf;
2329 void *p;
2330 void *end;
2331 u64 seq;
2332 u32 snap_count;
2333 struct ceph_snap_context *snapc;
2334 u32 i;
2335
2336 /*
2337 * We'll need room for the seq value (maximum snapshot id),
2338 * snapshot count, and array of that many snapshot ids.
2339 * For now we have a fixed upper limit on the number we're
2340 * prepared to receive.
2341 */
2342 size = sizeof (__le64) + sizeof (__le32) +
2343 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2344 reply_buf = kzalloc(size, GFP_KERNEL);
2345 if (!reply_buf)
2346 return -ENOMEM;
2347
2348 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2349 "rbd", "get_snapcontext",
2350 NULL, 0,
2351 reply_buf, size,
6e14b1a6 2352 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2353 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2354 if (ret < 0)
2355 goto out;
2356
2357 ret = -ERANGE;
2358 p = reply_buf;
2359 end = (char *) reply_buf + size;
2360 ceph_decode_64_safe(&p, end, seq, out);
2361 ceph_decode_32_safe(&p, end, snap_count, out);
2362
2363 /*
2364 * Make sure the reported number of snapshot ids wouldn't go
2365 * beyond the end of our buffer. But before checking that,
2366 * make sure the computed size of the snapshot context we
2367 * allocate is representable in a size_t.
2368 */
2369 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2370 / sizeof (u64)) {
2371 ret = -EINVAL;
2372 goto out;
2373 }
2374 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2375 goto out;
2376
2377 size = sizeof (struct ceph_snap_context) +
2378 snap_count * sizeof (snapc->snaps[0]);
2379 snapc = kmalloc(size, GFP_KERNEL);
2380 if (!snapc) {
2381 ret = -ENOMEM;
2382 goto out;
2383 }
2384
2385 atomic_set(&snapc->nref, 1);
2386 snapc->seq = seq;
2387 snapc->num_snaps = snap_count;
2388 for (i = 0; i < snap_count; i++)
2389 snapc->snaps[i] = ceph_decode_64(&p);
2390
2391 rbd_dev->header.snapc = snapc;
2392
2393 dout(" snap context seq = %llu, snap_count = %u\n",
2394 (unsigned long long) seq, (unsigned int) snap_count);
2395
2396out:
2397 kfree(reply_buf);
2398
2399 return 0;
2400}
2401
b8b1e2db
AE
2402static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2403{
2404 size_t size;
2405 void *reply_buf;
2406 __le64 snap_id;
2407 int ret;
2408 void *p;
2409 void *end;
b8b1e2db
AE
2410 char *snap_name;
2411
2412 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2413 reply_buf = kmalloc(size, GFP_KERNEL);
2414 if (!reply_buf)
2415 return ERR_PTR(-ENOMEM);
2416
2417 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2418 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2419 "rbd", "get_snapshot_name",
2420 (char *) &snap_id, sizeof (snap_id),
2421 reply_buf, size,
2422 CEPH_OSD_FLAG_READ, NULL);
2423 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2424 if (ret < 0)
2425 goto out;
2426
2427 p = reply_buf;
2428 end = (char *) reply_buf + size;
e5c35534 2429 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
2430 if (IS_ERR(snap_name)) {
2431 ret = PTR_ERR(snap_name);
2432 goto out;
2433 } else {
2434 dout(" snap_id 0x%016llx snap_name = %s\n",
2435 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2436 }
2437 kfree(reply_buf);
2438
2439 return snap_name;
2440out:
2441 kfree(reply_buf);
2442
2443 return ERR_PTR(ret);
2444}
2445
2446static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2447 u64 *snap_size, u64 *snap_features)
2448{
2449 __le64 snap_id;
2450 u8 order;
2451 int ret;
2452
2453 snap_id = rbd_dev->header.snapc->snaps[which];
2454 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2455 if (ret)
2456 return ERR_PTR(ret);
2457 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2458 if (ret)
2459 return ERR_PTR(ret);
2460
2461 return rbd_dev_v2_snap_name(rbd_dev, which);
2462}
2463
2464static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2465 u64 *snap_size, u64 *snap_features)
2466{
2467 if (rbd_dev->image_format == 1)
2468 return rbd_dev_v1_snap_info(rbd_dev, which,
2469 snap_size, snap_features);
2470 if (rbd_dev->image_format == 2)
2471 return rbd_dev_v2_snap_info(rbd_dev, which,
2472 snap_size, snap_features);
2473 return ERR_PTR(-EINVAL);
2474}
2475
117973fb
AE
2476static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2477{
2478 int ret;
2479 __u8 obj_order;
2480
2481 down_write(&rbd_dev->header_rwsem);
2482
2483 /* Grab old order first, to see if it changes */
2484
2485 obj_order = rbd_dev->header.obj_order,
2486 ret = rbd_dev_v2_image_size(rbd_dev);
2487 if (ret)
2488 goto out;
2489 if (rbd_dev->header.obj_order != obj_order) {
2490 ret = -EIO;
2491 goto out;
2492 }
2493 rbd_update_mapping_size(rbd_dev);
2494
2495 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2496 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2497 if (ret)
2498 goto out;
2499 ret = rbd_dev_snaps_update(rbd_dev);
2500 dout("rbd_dev_snaps_update returned %d\n", ret);
2501 if (ret)
2502 goto out;
2503 ret = rbd_dev_snaps_register(rbd_dev);
2504 dout("rbd_dev_snaps_register returned %d\n", ret);
2505out:
2506 up_write(&rbd_dev->header_rwsem);
2507
2508 return ret;
2509}
2510
dfc5606d 2511/*
35938150
AE
2512 * Scan the rbd device's current snapshot list and compare it to the
2513 * newly-received snapshot context. Remove any existing snapshots
2514 * not present in the new snapshot context. Add a new snapshot for
2515 * any snaphots in the snapshot context not in the current list.
2516 * And verify there are no changes to snapshots we already know
2517 * about.
2518 *
2519 * Assumes the snapshots in the snapshot context are sorted by
2520 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2521 * are also maintained in that order.)
dfc5606d 2522 */
304f6808 2523static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2524{
35938150
AE
2525 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2526 const u32 snap_count = snapc->num_snaps;
35938150
AE
2527 struct list_head *head = &rbd_dev->snaps;
2528 struct list_head *links = head->next;
2529 u32 index = 0;
dfc5606d 2530
9fcbb800 2531 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2532 while (index < snap_count || links != head) {
2533 u64 snap_id;
2534 struct rbd_snap *snap;
cd892126
AE
2535 char *snap_name;
2536 u64 snap_size = 0;
2537 u64 snap_features = 0;
dfc5606d 2538
35938150
AE
2539 snap_id = index < snap_count ? snapc->snaps[index]
2540 : CEPH_NOSNAP;
2541 snap = links != head ? list_entry(links, struct rbd_snap, node)
2542 : NULL;
aafb230e 2543 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2544
35938150
AE
2545 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2546 struct list_head *next = links->next;
dfc5606d 2547
35938150 2548 /* Existing snapshot not in the new snap context */
dfc5606d 2549
971f839a 2550 if (rbd_dev->snap_id == snap->id)
daba5fdb 2551 rbd_dev->exists = false;
41f38c2b 2552 rbd_remove_snap_dev(snap);
9fcbb800 2553 dout("%ssnap id %llu has been removed\n",
971f839a 2554 rbd_dev->snap_id == snap->id ? "mapped " : "",
9fcbb800 2555 (unsigned long long) snap->id);
35938150
AE
2556
2557 /* Done with this list entry; advance */
2558
2559 links = next;
dfc5606d
YS
2560 continue;
2561 }
35938150 2562
b8b1e2db
AE
2563 snap_name = rbd_dev_snap_info(rbd_dev, index,
2564 &snap_size, &snap_features);
cd892126
AE
2565 if (IS_ERR(snap_name))
2566 return PTR_ERR(snap_name);
2567
9fcbb800
AE
2568 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2569 (unsigned long long) snap_id);
35938150
AE
2570 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2571 struct rbd_snap *new_snap;
2572
2573 /* We haven't seen this snapshot before */
2574
c8d18425 2575 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2576 snap_id, snap_size, snap_features);
9fcbb800
AE
2577 if (IS_ERR(new_snap)) {
2578 int err = PTR_ERR(new_snap);
2579
2580 dout(" failed to add dev, error %d\n", err);
2581
2582 return err;
2583 }
35938150
AE
2584
2585 /* New goes before existing, or at end of list */
2586
9fcbb800 2587 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2588 if (snap)
2589 list_add_tail(&new_snap->node, &snap->node);
2590 else
523f3258 2591 list_add_tail(&new_snap->node, head);
35938150
AE
2592 } else {
2593 /* Already have this one */
2594
9fcbb800
AE
2595 dout(" already present\n");
2596
cd892126 2597 rbd_assert(snap->size == snap_size);
aafb230e 2598 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2599 rbd_assert(snap->features == snap_features);
35938150
AE
2600
2601 /* Done with this list entry; advance */
2602
2603 links = links->next;
dfc5606d 2604 }
35938150
AE
2605
2606 /* Advance to the next entry in the snapshot context */
2607
2608 index++;
dfc5606d 2609 }
9fcbb800 2610 dout("%s: done\n", __func__);
dfc5606d
YS
2611
2612 return 0;
2613}
2614
304f6808
AE
2615/*
2616 * Scan the list of snapshots and register the devices for any that
2617 * have not already been registered.
2618 */
2619static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2620{
2621 struct rbd_snap *snap;
2622 int ret = 0;
2623
2624 dout("%s called\n", __func__);
86ff77bb
AE
2625 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2626 return -EIO;
304f6808
AE
2627
2628 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2629 if (!rbd_snap_registered(snap)) {
2630 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2631 if (ret < 0)
2632 break;
2633 }
2634 }
2635 dout("%s: returning %d\n", __func__, ret);
2636
2637 return ret;
2638}
2639
dfc5606d
YS
2640static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2641{
dfc5606d 2642 struct device *dev;
cd789ab9 2643 int ret;
dfc5606d
YS
2644
2645 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2646
cd789ab9 2647 dev = &rbd_dev->dev;
dfc5606d
YS
2648 dev->bus = &rbd_bus_type;
2649 dev->type = &rbd_device_type;
2650 dev->parent = &rbd_root_dev;
2651 dev->release = rbd_dev_release;
de71a297 2652 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2653 ret = device_register(dev);
dfc5606d 2654
dfc5606d 2655 mutex_unlock(&ctl_mutex);
cd789ab9 2656
dfc5606d 2657 return ret;
602adf40
YS
2658}
2659
dfc5606d
YS
2660static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2661{
2662 device_unregister(&rbd_dev->dev);
2663}
2664
59c2be1e
YS
2665static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2666{
2667 int ret, rc;
2668
2669 do {
0e6f322d 2670 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2671 if (ret == -ERANGE) {
117973fb 2672 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
2673 if (rc < 0)
2674 return rc;
2675 }
2676 } while (ret == -ERANGE);
2677
2678 return ret;
2679}
2680
e2839308 2681static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
2682
2683/*
499afd5b
AE
2684 * Get a unique rbd identifier for the given new rbd_dev, and add
2685 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 2686 */
e2839308 2687static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 2688{
e2839308 2689 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
2690
2691 spin_lock(&rbd_dev_list_lock);
2692 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2693 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
2694 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2695 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 2696}
b7f23c36 2697
1ddbe94e 2698/*
499afd5b
AE
2699 * Remove an rbd_dev from the global list, and record that its
2700 * identifier is no longer in use.
1ddbe94e 2701 */
e2839308 2702static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2703{
d184f6bf 2704 struct list_head *tmp;
de71a297 2705 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2706 int max_id;
2707
aafb230e 2708 rbd_assert(rbd_id > 0);
499afd5b 2709
e2839308
AE
2710 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2711 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2712 spin_lock(&rbd_dev_list_lock);
2713 list_del_init(&rbd_dev->node);
d184f6bf
AE
2714
2715 /*
2716 * If the id being "put" is not the current maximum, there
2717 * is nothing special we need to do.
2718 */
e2839308 2719 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2720 spin_unlock(&rbd_dev_list_lock);
2721 return;
2722 }
2723
2724 /*
2725 * We need to update the current maximum id. Search the
2726 * list to find out what it is. We're more likely to find
2727 * the maximum at the end, so search the list backward.
2728 */
2729 max_id = 0;
2730 list_for_each_prev(tmp, &rbd_dev_list) {
2731 struct rbd_device *rbd_dev;
2732
2733 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
2734 if (rbd_dev->dev_id > max_id)
2735 max_id = rbd_dev->dev_id;
d184f6bf 2736 }
499afd5b 2737 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2738
1ddbe94e 2739 /*
e2839308 2740 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2741 * which case it now accurately reflects the new maximum.
2742 * Be careful not to overwrite the maximum value in that
2743 * case.
1ddbe94e 2744 */
e2839308
AE
2745 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2746 dout(" max dev id has been reset\n");
b7f23c36
AE
2747}
2748
e28fff26
AE
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in "C"/"POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip leading whitespace */

	return strcspn(*buf, spaces);	/* length of token now at *buf */
}
2767
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Same whitespace set as next_token(), inlined here */
	const char *spaces = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);
	len = strcspn(*buf, spaces);
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2797
ea3352f4
AE
2798/*
2799 * Finds the next token in *buf, dynamically allocates a buffer big
2800 * enough to hold a copy of it, and copies the token into the new
2801 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2802 * that a duplicate buffer is created even for a zero-length token.
2803 *
2804 * Returns a pointer to the newly-allocated duplicate, or a null
2805 * pointer if memory for the duplicate was not available. If
2806 * the lenp argument is a non-null pointer, the length of the token
2807 * (not including the '\0') is returned in *lenp.
2808 *
2809 * If successful, the *buf pointer will be updated to point beyond
2810 * the end of the found token.
2811 *
2812 * Note: uses GFP_KERNEL for allocation.
2813 */
2814static inline char *dup_token(const char **buf, size_t *lenp)
2815{
2816 char *dup;
2817 size_t len;
2818
2819 len = next_token(buf);
2820 dup = kmalloc(len + 1, GFP_KERNEL);
2821 if (!dup)
2822 return NULL;
2823
2824 memcpy(dup, *buf, len);
2825 *(dup + len) = '\0';
2826 *buf += len;
2827
2828 if (lenp)
2829 *lenp = len;
2830
2831 return dup;
2832}
2833
a725f65e 2834/*
3feeb894
AE
2835 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2836 * rbd_md_name, and name fields of the given rbd_dev, based on the
2837 * list of monitor addresses and other options provided via
2838 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2839 * copy of the snapshot name to map if successful, or a
2840 * pointer-coded error otherwise.
d22f76e7
AE
2841 *
2842 * Note: rbd_dev is assumed to have been initially zero-filled.
a725f65e 2843 */
0ddebc0c 2844static struct ceph_options *rbd_add_parse_args(struct rbd_device *rbd_dev,
819d52bf 2845 const char *buf)
e28fff26 2846{
d22f76e7 2847 size_t len;
0ddebc0c
AE
2848 const char *mon_addrs;
2849 size_t mon_addrs_size;
f28e565a
AE
2850 char *options;
2851 struct ceph_options *err_ptr = ERR_PTR(-EINVAL);
0ddebc0c
AE
2852 struct rbd_options rbd_opts;
2853 struct ceph_options *ceph_opts;
e28fff26
AE
2854
2855 /* The first four tokens are required */
2856
7ef3214a
AE
2857 len = next_token(&buf);
2858 if (!len)
f28e565a 2859 return err_ptr; /* Missing monitor address(es) */
0ddebc0c 2860 mon_addrs = buf;
f28e565a 2861 mon_addrs_size = len + 1;
7ef3214a 2862 buf += len;
a725f65e 2863
f28e565a
AE
2864 options = dup_token(&buf, NULL);
2865 if (!options)
2866 goto out_mem;
2867 if (!*options)
2868 goto out_err; /* Missing options */
e28fff26 2869
d22f76e7
AE
2870 rbd_dev->pool_name = dup_token(&buf, NULL);
2871 if (!rbd_dev->pool_name)
f28e565a
AE
2872 goto out_mem;
2873 if (!*rbd_dev->pool_name)
2874 goto out_err; /* Missing pool name */
e28fff26 2875
0bed54dc
AE
2876 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2877 if (!rbd_dev->image_name)
f28e565a
AE
2878 goto out_mem;
2879 if (!*rbd_dev->image_name)
2880 goto out_err; /* Missing image name */
d4b125e9 2881
f28e565a
AE
2882 /*
2883 * Snapshot name is optional; default is to use "-"
2884 * (indicating the head/no snapshot).
2885 */
3feeb894 2886 len = next_token(&buf);
820a5f3e 2887 if (!len) {
3feeb894
AE
2888 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2889 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a
AE
2890 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
2891 err_ptr = ERR_PTR(-ENAMETOOLONG);
2892 goto out_err;
849b4260 2893 }
819d52bf
AE
2894 rbd_dev->snap_name = kmalloc(len + 1, GFP_KERNEL);
2895 if (!rbd_dev->snap_name)
f28e565a 2896 goto out_mem;
819d52bf
AE
2897 memcpy(rbd_dev->snap_name, buf, len);
2898 *(rbd_dev->snap_name + len) = '\0';
e5c35534 2899
0ddebc0c 2900 /* Initialize all rbd options to the defaults */
e28fff26 2901
0ddebc0c 2902 rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
d22f76e7 2903
0ddebc0c
AE
2904 ceph_opts = ceph_parse_options(options, mon_addrs,
2905 mon_addrs + mon_addrs_size - 1,
2906 parse_rbd_opts_token, &rbd_opts);
f28e565a 2907 kfree(options);
0ddebc0c
AE
2908
2909 /* Record the parsed rbd options */
2910
f28e565a 2911 if (!IS_ERR(ceph_opts))
0ddebc0c 2912 rbd_dev->mapping.read_only = rbd_opts.read_only;
0ddebc0c
AE
2913
2914 return ceph_opts;
f28e565a
AE
2915out_mem:
2916 err_ptr = ERR_PTR(-ENOMEM);
d22f76e7 2917out_err:
0bed54dc 2918 kfree(rbd_dev->image_name);
d78fd7ae
AE
2919 rbd_dev->image_name = NULL;
2920 rbd_dev->image_name_len = 0;
d22f76e7
AE
2921 kfree(rbd_dev->pool_name);
2922 rbd_dev->pool_name = NULL;
f28e565a 2923 kfree(options);
d22f76e7 2924
3feeb894 2925 return err_ptr;
a725f65e
AE
2926}
2927
589d30e0
AE
2928/*
2929 * An rbd format 2 image has a unique identifier, distinct from the
2930 * name given to it by the user. Internally, that identifier is
2931 * what's used to specify the names of objects related to the image.
2932 *
2933 * A special "rbd id" object is used to map an rbd image name to its
2934 * id. If that object doesn't exist, then there is no v2 rbd image
2935 * with the supplied name.
2936 *
2937 * This function will record the given rbd_dev's image_id field if
2938 * it can be determined, and in that case will return 0. If any
2939 * errors occur a negative errno will be returned and the rbd_dev's
2940 * image_id field will be unchanged (and should be NULL).
2941 */
2942static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2943{
2944 int ret;
2945 size_t size;
2946 char *object_name;
2947 void *response;
2948 void *p;
2949
2950 /*
2951 * First, see if the format 2 image id file exists, and if
2952 * so, get the image's persistent id from it.
2953 */
2954 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2955 object_name = kmalloc(size, GFP_NOIO);
2956 if (!object_name)
2957 return -ENOMEM;
2958 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2959 dout("rbd id object name is %s\n", object_name);
2960
2961 /* Response will be an encoded string, which includes a length */
2962
2963 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2964 response = kzalloc(size, GFP_NOIO);
2965 if (!response) {
2966 ret = -ENOMEM;
2967 goto out;
2968 }
2969
2970 ret = rbd_req_sync_exec(rbd_dev, object_name,
2971 "rbd", "get_id",
2972 NULL, 0,
2973 response, RBD_IMAGE_ID_LEN_MAX,
2974 CEPH_OSD_FLAG_READ, NULL);
2975 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2976 if (ret < 0)
2977 goto out;
a0ea3a40 2978 ret = 0; /* rbd_req_sync_exec() can return positive */
589d30e0
AE
2979
2980 p = response;
2981 rbd_dev->image_id = ceph_extract_encoded_string(&p,
2982 p + RBD_IMAGE_ID_LEN_MAX,
2983 &rbd_dev->image_id_len,
2984 GFP_NOIO);
2985 if (IS_ERR(rbd_dev->image_id)) {
2986 ret = PTR_ERR(rbd_dev->image_id);
2987 rbd_dev->image_id = NULL;
2988 } else {
2989 dout("image_id is %s\n", rbd_dev->image_id);
2990 }
2991out:
2992 kfree(response);
2993 kfree(object_name);
2994
2995 return ret;
2996}
2997
a30b71b9
AE
2998static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2999{
3000 int ret;
3001 size_t size;
3002
3003 /* Version 1 images have no id; empty string is used */
3004
3005 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
3006 if (!rbd_dev->image_id)
3007 return -ENOMEM;
3008 rbd_dev->image_id_len = 0;
3009
3010 /* Record the header object name for this rbd image. */
3011
3012 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
3013 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3014 if (!rbd_dev->header_name) {
3015 ret = -ENOMEM;
3016 goto out_err;
3017 }
3018 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
3019
3020 /* Populate rbd image metadata */
3021
3022 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3023 if (ret < 0)
3024 goto out_err;
3025 rbd_dev->image_format = 1;
3026
3027 dout("discovered version 1 image, header name is %s\n",
3028 rbd_dev->header_name);
3029
3030 return 0;
3031
3032out_err:
3033 kfree(rbd_dev->header_name);
3034 rbd_dev->header_name = NULL;
3035 kfree(rbd_dev->image_id);
3036 rbd_dev->image_id = NULL;
3037
3038 return ret;
3039}
3040
3041static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3042{
3043 size_t size;
9d475de5 3044 int ret;
6e14b1a6 3045 u64 ver = 0;
a30b71b9
AE
3046
3047 /*
3048 * Image id was filled in by the caller. Record the header
3049 * object name for this rbd image.
3050 */
3051 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
3052 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3053 if (!rbd_dev->header_name)
3054 return -ENOMEM;
3055 sprintf(rbd_dev->header_name, "%s%s",
3056 RBD_HEADER_PREFIX, rbd_dev->image_id);
9d475de5
AE
3057
3058 /* Get the size and object order for the image */
3059
3060 ret = rbd_dev_v2_image_size(rbd_dev);
1e130199
AE
3061 if (ret < 0)
3062 goto out_err;
3063
3064 /* Get the object prefix (a.k.a. block_name) for the image */
3065
3066 ret = rbd_dev_v2_object_prefix(rbd_dev);
b1b5402a
AE
3067 if (ret < 0)
3068 goto out_err;
3069
d889140c 3070 /* Get the and check features for the image */
b1b5402a
AE
3071
3072 ret = rbd_dev_v2_features(rbd_dev);
9d475de5
AE
3073 if (ret < 0)
3074 goto out_err;
35d489f9 3075
6e14b1a6
AE
3076 /* crypto and compression type aren't (yet) supported for v2 images */
3077
3078 rbd_dev->header.crypt_type = 0;
3079 rbd_dev->header.comp_type = 0;
35d489f9 3080
6e14b1a6
AE
3081 /* Get the snapshot context, plus the header version */
3082
3083 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
35d489f9
AE
3084 if (ret)
3085 goto out_err;
6e14b1a6
AE
3086 rbd_dev->header.obj_version = ver;
3087
a30b71b9
AE
3088 rbd_dev->image_format = 2;
3089
3090 dout("discovered version 2 image, header name is %s\n",
3091 rbd_dev->header_name);
3092
35152979 3093 return 0;
9d475de5
AE
3094out_err:
3095 kfree(rbd_dev->header_name);
3096 rbd_dev->header_name = NULL;
1e130199
AE
3097 kfree(rbd_dev->header.object_prefix);
3098 rbd_dev->header.object_prefix = NULL;
9d475de5
AE
3099
3100 return ret;
a30b71b9
AE
3101}
3102
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	ret = ret ? rbd_dev_v1_probe(rbd_dev) : rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3127
59c2be1e
YS
/*
 * Handle a write to the bus-level "add" sysfs attribute: parse the
 * user-supplied mapping spec in @buf, create and register a new rbd
 * device, and announce its disk.  Returns @count on success or a
 * negative errno.  On failure every resource acquired so far is
 * released via the goto-cleanup chain at the bottom, whose labels
 * unwind in exact reverse order of acquisition.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the lifetime of the device. */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	ceph_opts = rbd_add_parse_args(rbd_dev, buf);
	if (IS_ERR(ceph_opts)) {
		rc = PTR_ERR(ceph_opts);
		goto err_out_mem;
	}

	rc = rbd_get_client(rbd_dev, ceph_opts);
	if (rc < 0)
		goto err_out_args;
	ceph_opts = NULL;	/* ceph_opts now owned by rbd_dev client */

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = (u64) rc;

	/* Determine image format (1 or 2) and read its header. */
	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_probe;

	rc = rbd_dev_set_mapping(rbd_dev);
	if (rc)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	/* Snapshot registration touches the header, so take the rwsem. */
	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	return rc;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);
err_out_probe:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	/* ceph_opts is non-NULL only if rbd_get_client() never took it */
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	kfree(rbd_dev);	/* kfree(NULL) is a no-op */

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
3265
de71a297 3266static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3267{
3268 struct list_head *tmp;
3269 struct rbd_device *rbd_dev;
3270
e124a82f 3271 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3272 list_for_each(tmp, &rbd_dev_list) {
3273 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3274 if (rbd_dev->dev_id == dev_id) {
e124a82f 3275 spin_unlock(&rbd_dev_list_lock);
602adf40 3276 return rbd_dev;
e124a82f 3277 }
602adf40 3278 }
e124a82f 3279 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3280 return NULL;
3281}
3282
/*
 * Device-model release callback for an rbd device: tear down the
 * watch, drop the ceph client, free the disk and header state, and
 * finally free the rbd_device itself.  The order matters — the
 * watch must be stopped while the client is still valid, and the
 * id is released only after nothing can look the device up.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request before dropping the client. */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3317
dfc5606d
YS
3318static ssize_t rbd_remove(struct bus_type *bus,
3319 const char *buf,
3320 size_t count)
602adf40
YS
3321{
3322 struct rbd_device *rbd_dev = NULL;
3323 int target_id, rc;
3324 unsigned long ul;
3325 int ret = count;
3326
3327 rc = strict_strtoul(buf, 10, &ul);
3328 if (rc)
3329 return rc;
3330
3331 /* convert to int; abort if we lost anything in the conversion */
3332 target_id = (int) ul;
3333 if (target_id != ul)
3334 return -EINVAL;
3335
3336 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3337
3338 rbd_dev = __rbd_get_dev(target_id);
3339 if (!rbd_dev) {
3340 ret = -ENOENT;
3341 goto done;
3342 }
3343
41f38c2b 3344 rbd_remove_all_snaps(rbd_dev);
dfc5606d 3345 rbd_bus_del_dev(rbd_dev);
602adf40
YS
3346
3347done:
3348 mutex_unlock(&ctl_mutex);
aafb230e 3349
602adf40
YS
3350 return ret;
3351}
3352
602adf40
YS
3353/*
3354 * create control files in sysfs
dfc5606d 3355 * /sys/bus/rbd/...
602adf40
YS
3356 */
3357static int rbd_sysfs_init(void)
3358{
dfc5606d 3359 int ret;
602adf40 3360
fed4c143 3361 ret = device_register(&rbd_root_dev);
21079786 3362 if (ret < 0)
dfc5606d 3363 return ret;
602adf40 3364
fed4c143
AE
3365 ret = bus_register(&rbd_bus_type);
3366 if (ret < 0)
3367 device_unregister(&rbd_root_dev);
602adf40 3368
602adf40
YS
3369 return ret;
3370}
3371
/*
 * Remove the sysfs control files: unregister the bus first, then
 * the root device it hangs off — reverse of rbd_sysfs_init().
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3377
3378int __init rbd_init(void)
3379{
3380 int rc;
3381
3382 rc = rbd_sysfs_init();
3383 if (rc)
3384 return rc;
f0f8cef5 3385 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3386 return 0;
3387}
3388
/* Module exit point: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3393
/* Module init/exit hooks and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");