]> git.ipfire.org Git - people/arne_f/kernel.git/blame - drivers/block/rbd.c
rbd: allow null image name
[people/arne_f/kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
d4b125e9
AE
64#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
35d489f9 68#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
69#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
1e130199
AE
73#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 75
d889140c
AE
76/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
81a89793
AE
84/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
602adf40 90#define DEV_NAME_LEN 32
81a89793 91#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 92
cc0538b6 93#define RBD_READ_ONLY_DEFAULT false
59c2be1e 94
602adf40
YS
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* NUL-terminated copy of the on-disk prefix */
	u64 features;		/* feature bit mask (always 0 for format 1 images) */
	__u8 obj_order;		/* log2 of the rados object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* snapshot names, NUL-terminated, back to back */
	u64 *snap_sizes;	/* one image size per snapshot */

	u64 obj_version;	/* NOTE(review): presumably header object version — set elsewhere */
};
114
0d7dbfce
AE
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.
 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	size_t image_id_len;
	char *image_name;
	size_t image_name_len;

	u64 snap_id;		/* CEPH_NOSNAP when the head is mapped */
	char *snap_name;	/* RBD_SNAP_HEAD_NAME ("-") for the head */

	struct kref kref;	/* reference count */
};
135
/* User-supplied mapping options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool read_only;		/* map the device read-only? */
};
139
/*
 * an instance of the client. multiple devices may share an rbd client
 * (see rbd_get_client()/rbd_client_find()).
 */
struct rbd_client {
	struct ceph_client *client;	/* underlying libceph client */
	struct kref kref;		/* shared-use reference count */
	struct list_head node;		/* entry in rbd_client_list */
};
148
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* non-zero once this request has completed */
	int rc;		/* result code */
	u64 bytes;	/* bytes transferred */
};
157
/*
 * a collection of requests sharing one block-layer request;
 * completed in order by rbd_coll_end_req_index()
 */
struct rbd_req_coll {
	int total;		/* number of requests in the collection */
	int num_done;		/* how many have been completed so far */
	struct kref kref;	/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per request */
};
167
f0f8cef5
AE
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* request length, in bytes */
	int coll_index;			/* slot in coll->status[] */
	struct rbd_req_coll *coll;	/* collection; may be NULL */
};
179
dfc5606d
YS
/* In-memory record of one snapshot, also exposed through sysfs */
struct rbd_snap {
	struct device dev;	/* sysfs device for this snapshot */
	const char *name;
	u64 size;		/* image size at this snapshot */
	struct list_head node;	/* entry in rbd_dev->snaps */
	u64 id;			/* snapshot id */
	u64 features;
};
188
/* Properties of the image or snapshot this device currently maps */
struct rbd_mapping {
	u64 size;		/* mapped size, in bytes */
	u64 features;
	bool read_only;
};
194
602adf40
YS
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;	/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	bool exists;		/* set in rbd_dev_set_mapping() */
	struct rbd_spec *spec;

	char *header_name;	/* NOTE(review): presumably the header object name — set elsewhere */

	struct ceph_osd_event *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
233
602adf40 234static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 235
602adf40 236static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
237static DEFINE_SPINLOCK(rbd_dev_list_lock);
238
432b8587
AE
239static LIST_HEAD(rbd_client_list); /* clients */
240static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 241
304f6808
AE
242static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
243static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
244
dfc5606d 245static void rbd_dev_release(struct device *dev);
41f38c2b 246static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 247
f0f8cef5
AE
248static ssize_t rbd_add(struct bus_type *bus, const char *buf,
249 size_t count);
250static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
251 size_t count);
252
/* Writing to the bus "add"/"remove" attributes maps/unmaps a device */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
258
/* The pseudo-bus all rbd devices hang off of */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
263
/* No-op release; rbd_root_dev is statically allocated */
static void rbd_root_dev_release(struct device *dev)
{
}
267
/* Parent device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name	= "rbd",
	.release	= rbd_root_dev_release,
};
272
aafb230e
AE
#ifdef RBD_DEBUG
/*
 * Verify an invariant; log the failed expression and BUG() when it
 * does not hold.  Wrapped in do { } while (0) so the macro behaves
 * as a single statement (safe in unbraced if/else bodies).
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
					"at line %d:\n\n"		\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 285
dfc5606d
YS
/* Take a reference on the sysfs device embedded in an rbd_device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
290
/* Drop the reference taken by rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
602adf40 295
117973fb
AE
296static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
297static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 298
602adf40
YS
/*
 * Block device open: refuse writable opens of a read-only mapping,
 * otherwise pin the device and propagate the ro flag to the blkdev.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
311
dfc5606d
YS
/* Block device release: drop the reference taken in rbd_open() */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
320
602adf40
YS
/* Block device operations for /dev/rbd<N> */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
326
327/*
328 * Initialize an rbd client instance.
43ae4701 329 * We own *ceph_opts.
602adf40 330 */
f8c38929 331static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
332{
333 struct rbd_client *rbdc;
334 int ret = -ENOMEM;
335
336 dout("rbd_client_create\n");
337 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
338 if (!rbdc)
339 goto out_opt;
340
341 kref_init(&rbdc->kref);
342 INIT_LIST_HEAD(&rbdc->node);
343
bc534d86
AE
344 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
345
43ae4701 346 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 347 if (IS_ERR(rbdc->client))
bc534d86 348 goto out_mutex;
43ae4701 349 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
350
351 ret = ceph_open_session(rbdc->client);
352 if (ret < 0)
353 goto out_err;
354
432b8587 355 spin_lock(&rbd_client_list_lock);
602adf40 356 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 357 spin_unlock(&rbd_client_list_lock);
602adf40 358
bc534d86
AE
359 mutex_unlock(&ctl_mutex);
360
602adf40
YS
361 dout("rbd_client_create created %p\n", rbdc);
362 return rbdc;
363
364out_err:
365 ceph_destroy_client(rbdc->client);
bc534d86
AE
366out_mutex:
367 mutex_unlock(&ctl_mutex);
602adf40
YS
368 kfree(rbdc);
369out_opt:
43ae4701
AE
370 if (ceph_opts)
371 ceph_destroy_options(ceph_opts);
28f259b7 372 return ERR_PTR(ret);
602adf40
YS
373}
374
375/*
1f7ba331
AE
376 * Find a ceph client with specific addr and configuration. If
377 * found, bump its reference count.
602adf40 378 */
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.  Returns NULL when there is no
 * match, or when sharing is disabled via CEPH_OPT_NOSHARE.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
399
59c2be1e
YS
/*
 * mount options
 *
 * The Opt_last_* markers delimit the ranges of int-, string- and
 * Boolean-valued tokens for parse_rbd_opts_token().
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
413
/* Token table for match_token() in parse_rbd_opts_token() */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
424
/*
 * Parse a single mount option token and record its effect in the
 * struct rbd_options passed as @private.  Returns 0 on success, or
 * a negative errno for an unknown token or malformed argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned something not in our table */
		rbd_assert(false);
		break;
	}
	return 0;
}
465
602adf40
YS
/*
 * Get a ceph client with the given configuration, creating a new
 * one only when no shareable client already exists.  Consumes
 * *ceph_opts either way (a reused client keeps its own options).
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* Using an existing client; this options copy is not needed */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
482
/*
 * Destroy ceph client.  Called via kref_put() when the last
 * reference is dropped.  Takes rbd_client_list_lock itself to
 * unlink the client, so the caller must NOT hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
500
501/*
502 * Drop reference to ceph client node. If it's not referenced anymore, release
503 * it.
504 */
9d3997fd 505static void rbd_put_client(struct rbd_client *rbdc)
602adf40 506{
c53d5893
AE
507 if (rbdc)
508 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
509}
510
1fec7093
YS
/*
 * Destroy requests collection once its last reference is dropped.
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
602adf40 522
a30b71b9
AE
523static bool rbd_image_format_valid(u32 image_format)
524{
525 return image_format == 1 || image_format == 2;
526}
527
8e94af8e
AE
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * any of its contents.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
566
602adf40
YS
567/*
568 * Create a new header structure, translate header format from the on-disk
569 * header.
570 */
571static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 572 struct rbd_image_header_ondisk *ondisk)
602adf40 573{
ccece235 574 u32 snap_count;
58c17b0e 575 size_t len;
d2bb24e5 576 size_t size;
621901d6 577 u32 i;
602adf40 578
6a52325f
AE
579 memset(header, 0, sizeof (*header));
580
103a150f
AE
581 snap_count = le32_to_cpu(ondisk->snap_count);
582
58c17b0e
AE
583 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
584 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 585 if (!header->object_prefix)
602adf40 586 return -ENOMEM;
58c17b0e
AE
587 memcpy(header->object_prefix, ondisk->object_prefix, len);
588 header->object_prefix[len] = '\0';
00f1f36f 589
602adf40 590 if (snap_count) {
f785cc1d
AE
591 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
592
621901d6
AE
593 /* Save a copy of the snapshot names */
594
f785cc1d
AE
595 if (snap_names_len > (u64) SIZE_MAX)
596 return -EIO;
597 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 598 if (!header->snap_names)
6a52325f 599 goto out_err;
f785cc1d
AE
600 /*
601 * Note that rbd_dev_v1_header_read() guarantees
602 * the ondisk buffer we're working with has
603 * snap_names_len bytes beyond the end of the
604 * snapshot id array, this memcpy() is safe.
605 */
606 memcpy(header->snap_names, &ondisk->snaps[snap_count],
607 snap_names_len);
6a52325f 608
621901d6
AE
609 /* Record each snapshot's size */
610
d2bb24e5
AE
611 size = snap_count * sizeof (*header->snap_sizes);
612 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 613 if (!header->snap_sizes)
6a52325f 614 goto out_err;
621901d6
AE
615 for (i = 0; i < snap_count; i++)
616 header->snap_sizes[i] =
617 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 618 } else {
ccece235 619 WARN_ON(ondisk->snap_names_len);
602adf40
YS
620 header->snap_names = NULL;
621 header->snap_sizes = NULL;
622 }
849b4260 623
34b13184 624 header->features = 0; /* No features support in v1 images */
602adf40
YS
625 header->obj_order = ondisk->options.order;
626 header->crypt_type = ondisk->options.crypt_type;
627 header->comp_type = ondisk->options.comp_type;
6a52325f 628
621901d6
AE
629 /* Allocate and fill in the snapshot context */
630
f84344f3 631 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
632 size = sizeof (struct ceph_snap_context);
633 size += snap_count * sizeof (header->snapc->snaps[0]);
634 header->snapc = kzalloc(size, GFP_KERNEL);
635 if (!header->snapc)
636 goto out_err;
602adf40
YS
637
638 atomic_set(&header->snapc->nref, 1);
505cbb9b 639 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 640 header->snapc->num_snaps = snap_count;
621901d6
AE
641 for (i = 0; i < snap_count; i++)
642 header->snapc->snaps[i] =
643 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
644
645 return 0;
646
6a52325f 647out_err:
849b4260 648 kfree(header->snap_sizes);
ccece235 649 header->snap_sizes = NULL;
602adf40 650 kfree(header->snap_names);
ccece235 651 header->snap_names = NULL;
6a52325f
AE
652 kfree(header->object_prefix);
653 header->object_prefix = NULL;
ccece235 654
00f1f36f 655 return -ENOMEM;
602adf40
YS
656}
657
/*
 * Look up a snapshot by name and, if found, make it the current
 * mapping: record its id, size and features in rbd_dev.  Returns 0
 * on success, -ENOENT when no snapshot has that name.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
675
/*
 * Establish what this device maps: either the image head (snapshot
 * name "-") or a named snapshot.  Snapshot mappings are forced
 * read-only.  Marks the device as existing on success.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->exists = true;
done:
	return ret;
}
696
/*
 * Release everything allocated by rbd_header_from_disk(), leaving
 * the pointers NULL so a repeated free is harmless.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
708
/*
 * Return the name of the rados object backing the segment that
 * contains image byte @offset: "<prefix>.<segment, 12 hex digits>".
 * The caller must kfree() the result; returns NULL on allocation
 * or formatting failure.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
602adf40 730
65ccfe21
AE
731static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
732{
733 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 734
65ccfe21
AE
735 return offset & (segment_size - 1);
736}
737
/*
 * Clip an I/O of @length bytes at image offset @offset so it does
 * not cross a segment boundary; returns the usable length within
 * the segment.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
751
1fec7093
YS
752static int rbd_get_num_segments(struct rbd_image_header *header,
753 u64 ofs, u64 len)
754{
df111be6
AE
755 u64 start_seg;
756 u64 end_seg;
757
758 if (!len)
759 return 0;
760 if (len - 1 > U64_MAX - ofs)
761 return -ERANGE;
762
763 start_seg = ofs >> header->obj_order;
764 end_seg = (ofs + len - 1) >> header->obj_order;
765
1fec7093
YS
766 return end_seg - start_seg + 1;
767}
768
029bcbd8
JD
769/*
770 * returns the size of an object in the image
771 */
772static u64 rbd_obj_bytes(struct rbd_image_header *header)
773{
774 return 1 << header->obj_order;
775}
776
602adf40
YS
777/*
778 * bio helpers
779 */
780
781static void bio_chain_put(struct bio *chain)
782{
783 struct bio *tmp;
784
785 while (chain) {
786 tmp = chain;
787 chain = chain->bi_next;
788 bio_put(tmp);
789 }
790}
791
/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero this segment from start_ofs onward */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
818
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.  Returns the
 * new bio, or NULL on allocation failure or bad arguments.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset into the first copied bio_vec */

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;	/* number of bio_vecs to copy */

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		/* resid is what remains of len within the last bio_vec */
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
899
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* Clone no more than what is left of this bio, or of len */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Source bio exhausted; advance to the next one */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
960
961/*
962 * helpers for osd request op vectors.
963 */
57cfc106
AE
/*
 * Allocate a zero-terminated vector of @num_ops osd request ops,
 * with the first op initialized from @opcode/@payload_len.  Returns
 * NULL on allocation failure; free with rbd_destroy_ops().
 */
static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
					int opcode, u32 payload_len)
{
	struct ceph_osd_req_op *ops;

	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
	if (!ops)
		return NULL;

	ops[0].op = opcode;

	/*
	 * op extent offset and length will be set later on
	 * in calc_raw_layout()
	 */
	ops[0].payload_len = payload_len;

	return ops;
}
983
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
988
1fec7093
YS
/*
 * Record completion of request @index within collection @coll, then
 * complete (in order, under the queue lock) the contiguous prefix of
 * requests that have all finished.  Without a collection the block
 * request is completed in one step.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* Find how far the run of completed requests now extends */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		/* Each completed request drops its collection reference */
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1026
/* Complete one request using the collection info stored in it */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
1032
602adf40
YS
/*
 * Send ceph osd request.
 *
 * Builds and submits an osd request for the extent [ofs, ofs+len)
 * of the named object, attaching either a bio chain or a page
 * vector as the data payload.  When @rbd_cb is NULL the call is
 * synchronous and waits for the request to complete; otherwise
 * @rbd_cb runs on completion.  Returns 0 or a negative errno.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Still account a failed request toward its collection */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* Unstriped layout: one object, pool taken from the image spec */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->spec->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait, report the reassert version, drop req */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1146
1147/*
1148 * Ceph osd op callback
1149 */
1150static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1151{
1152 struct rbd_request *req_data = req->r_priv;
1153 struct ceph_osd_reply_head *replyhead;
1154 struct ceph_osd_op *op;
1155 __s32 rc;
1156 u64 bytes;
1157 int read_op;
1158
1159 /* parse reply */
1160 replyhead = msg->front.iov_base;
1161 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1162 op = (void *)(replyhead + 1);
1163 rc = le32_to_cpu(replyhead->result);
1164 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1165 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1166
bd919d45
AE
1167 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1168 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1169
1170 if (rc == -ENOENT && read_op) {
1171 zero_bio_chain(req_data->bio, 0);
1172 rc = 0;
1173 } else if (rc == 0 && read_op && bytes < req_data->len) {
1174 zero_bio_chain(req_data->bio, bytes);
1175 bytes = req_data->len;
1176 }
1177
1fec7093 1178 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1179
1180 if (req_data->bio)
1181 bio_chain_put(req_data->bio);
1182
1183 ceph_osdc_put_request(req);
1184 kfree(req_data);
1185}
1186
59c2be1e
YS
1187static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1188{
1189 ceph_osdc_put_request(req);
1190}
1191
602adf40
YS
1192/*
1193 * Do a synchronous ceph osd operation
1194 */
0ce1a794 1195static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1196 struct ceph_snap_context *snapc,
1197 u64 snapid,
602adf40 1198 int flags,
913d2fdc 1199 struct ceph_osd_req_op *ops,
aded07ea 1200 const char *object_name,
f8d4de6e
AE
1201 u64 ofs, u64 inbound_size,
1202 char *inbound,
59c2be1e
YS
1203 struct ceph_osd_request **linger_req,
1204 u64 *ver)
602adf40
YS
1205{
1206 int ret;
1207 struct page **pages;
1208 int num_pages;
913d2fdc 1209
aafb230e 1210 rbd_assert(ops != NULL);
602adf40 1211
f8d4de6e 1212 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1213 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1214 if (IS_ERR(pages))
1215 return PTR_ERR(pages);
602adf40 1216
0ce1a794 1217 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1218 object_name, ofs, inbound_size, NULL,
602adf40
YS
1219 pages, num_pages,
1220 flags,
1221 ops,
1fec7093 1222 NULL, 0,
59c2be1e
YS
1223 NULL,
1224 linger_req, ver);
602adf40 1225 if (ret < 0)
913d2fdc 1226 goto done;
602adf40 1227
f8d4de6e
AE
1228 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1229 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1230
602adf40
YS
1231done:
1232 ceph_release_page_vector(pages, num_pages);
1233 return ret;
1234}
1235
1236/*
1237 * Do an asynchronous ceph osd operation
1238 */
1239static int rbd_do_op(struct request *rq,
0ce1a794 1240 struct rbd_device *rbd_dev,
602adf40 1241 struct ceph_snap_context *snapc,
602adf40 1242 u64 ofs, u64 len,
1fec7093
YS
1243 struct bio *bio,
1244 struct rbd_req_coll *coll,
1245 int coll_index)
602adf40
YS
1246{
1247 char *seg_name;
1248 u64 seg_ofs;
1249 u64 seg_len;
1250 int ret;
1251 struct ceph_osd_req_op *ops;
1252 u32 payload_len;
ff2e4bb5
AE
1253 int opcode;
1254 int flags;
4634246d 1255 u64 snapid;
602adf40 1256
65ccfe21 1257 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1258 if (!seg_name)
1259 return -ENOMEM;
65ccfe21
AE
1260 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1261 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40 1262
ff2e4bb5
AE
1263 if (rq_data_dir(rq) == WRITE) {
1264 opcode = CEPH_OSD_OP_WRITE;
1265 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
4634246d 1266 snapid = CEPH_NOSNAP;
ff2e4bb5
AE
1267 payload_len = seg_len;
1268 } else {
1269 opcode = CEPH_OSD_OP_READ;
1270 flags = CEPH_OSD_FLAG_READ;
4634246d 1271 snapc = NULL;
0d7dbfce 1272 snapid = rbd_dev->spec->snap_id;
ff2e4bb5
AE
1273 payload_len = 0;
1274 }
602adf40 1275
57cfc106
AE
1276 ret = -ENOMEM;
1277 ops = rbd_create_rw_ops(1, opcode, payload_len);
1278 if (!ops)
602adf40
YS
1279 goto done;
1280
1281 /* we've taken care of segment sizes earlier when we
1282 cloned the bios. We should never have a segment
1283 truncated at this point */
aafb230e 1284 rbd_assert(seg_len == len);
602adf40
YS
1285
1286 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1287 seg_name, seg_ofs, seg_len,
1288 bio,
1289 NULL, 0,
1290 flags,
1291 ops,
1fec7093 1292 coll, coll_index,
59c2be1e 1293 rbd_req_cb, 0, NULL);
11f77002
SW
1294
1295 rbd_destroy_ops(ops);
602adf40
YS
1296done:
1297 kfree(seg_name);
1298 return ret;
1299}
1300
602adf40
YS
1301/*
1302 * Request sync osd read
1303 */
0ce1a794 1304static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1305 u64 snapid,
aded07ea 1306 const char *object_name,
602adf40 1307 u64 ofs, u64 len,
59c2be1e
YS
1308 char *buf,
1309 u64 *ver)
602adf40 1310{
913d2fdc
AE
1311 struct ceph_osd_req_op *ops;
1312 int ret;
1313
1314 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1315 if (!ops)
1316 return -ENOMEM;
1317
1318 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1319 snapid,
602adf40 1320 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1321 ops, object_name, ofs, len, buf, NULL, ver);
1322 rbd_destroy_ops(ops);
1323
1324 return ret;
602adf40
YS
1325}
1326
1327/*
59c2be1e
YS
1328 * Request sync osd watch
1329 */
0ce1a794 1330static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1331 u64 ver,
7f0a24d8 1332 u64 notify_id)
59c2be1e
YS
1333{
1334 struct ceph_osd_req_op *ops;
11f77002
SW
1335 int ret;
1336
57cfc106
AE
1337 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1338 if (!ops)
1339 return -ENOMEM;
59c2be1e 1340
a71b891b 1341 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1342 ops[0].watch.cookie = notify_id;
1343 ops[0].watch.flag = 0;
1344
0ce1a794 1345 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1346 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1347 NULL, 0,
59c2be1e
YS
1348 CEPH_OSD_FLAG_READ,
1349 ops,
1fec7093 1350 NULL, 0,
59c2be1e
YS
1351 rbd_simple_req_cb, 0, NULL);
1352
1353 rbd_destroy_ops(ops);
1354 return ret;
1355}
1356
1357static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1358{
0ce1a794 1359 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1360 u64 hver;
13143d2d
SW
1361 int rc;
1362
0ce1a794 1363 if (!rbd_dev)
59c2be1e
YS
1364 return;
1365
bd919d45
AE
1366 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1367 rbd_dev->header_name, (unsigned long long) notify_id,
1368 (unsigned int) opcode);
117973fb 1369 rc = rbd_dev_refresh(rbd_dev, &hver);
13143d2d 1370 if (rc)
f0f8cef5 1371 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1372 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1373
7f0a24d8 1374 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1375}
1376
1377/*
1378 * Request sync osd watch
1379 */
0e6f322d 1380static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1381{
1382 struct ceph_osd_req_op *ops;
0ce1a794 1383 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1384 int ret;
59c2be1e 1385
57cfc106
AE
1386 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1387 if (!ops)
1388 return -ENOMEM;
59c2be1e
YS
1389
1390 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1391 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1392 if (ret < 0)
1393 goto fail;
1394
0e6f322d 1395 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1396 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1397 ops[0].watch.flag = 1;
1398
0ce1a794 1399 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1400 CEPH_NOSNAP,
59c2be1e
YS
1401 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1402 ops,
0e6f322d
AE
1403 rbd_dev->header_name,
1404 0, 0, NULL,
0ce1a794 1405 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1406
1407 if (ret < 0)
1408 goto fail_event;
1409
1410 rbd_destroy_ops(ops);
1411 return 0;
1412
1413fail_event:
0ce1a794
AE
1414 ceph_osdc_cancel_event(rbd_dev->watch_event);
1415 rbd_dev->watch_event = NULL;
59c2be1e
YS
1416fail:
1417 rbd_destroy_ops(ops);
1418 return ret;
1419}
1420
79e3057c
YS
1421/*
1422 * Request sync osd unwatch
1423 */
070c633f 1424static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1425{
1426 struct ceph_osd_req_op *ops;
57cfc106 1427 int ret;
79e3057c 1428
57cfc106
AE
1429 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1430 if (!ops)
1431 return -ENOMEM;
79e3057c
YS
1432
1433 ops[0].watch.ver = 0;
0ce1a794 1434 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1435 ops[0].watch.flag = 0;
1436
0ce1a794 1437 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1438 CEPH_NOSNAP,
79e3057c
YS
1439 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1440 ops,
070c633f
AE
1441 rbd_dev->header_name,
1442 0, 0, NULL, NULL, NULL);
1443
79e3057c
YS
1444
1445 rbd_destroy_ops(ops);
0ce1a794
AE
1446 ceph_osdc_cancel_event(rbd_dev->watch_event);
1447 rbd_dev->watch_event = NULL;
79e3057c
YS
1448 return ret;
1449}
1450
602adf40 1451/*
3cb4a687 1452 * Synchronous osd object method call
602adf40 1453 */
0ce1a794 1454static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1455 const char *object_name,
1456 const char *class_name,
1457 const char *method_name,
3cb4a687
AE
1458 const char *outbound,
1459 size_t outbound_size,
f8d4de6e
AE
1460 char *inbound,
1461 size_t inbound_size,
3cb4a687 1462 int flags,
59c2be1e 1463 u64 *ver)
602adf40
YS
1464{
1465 struct ceph_osd_req_op *ops;
aded07ea
AE
1466 int class_name_len = strlen(class_name);
1467 int method_name_len = strlen(method_name);
3cb4a687 1468 int payload_size;
57cfc106
AE
1469 int ret;
1470
3cb4a687
AE
1471 /*
1472 * Any input parameters required by the method we're calling
1473 * will be sent along with the class and method names as
1474 * part of the message payload. That data and its size are
1475 * supplied via the indata and indata_len fields (named from
1476 * the perspective of the server side) in the OSD request
1477 * operation.
1478 */
1479 payload_size = class_name_len + method_name_len + outbound_size;
1480 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1481 if (!ops)
1482 return -ENOMEM;
602adf40 1483
aded07ea
AE
1484 ops[0].cls.class_name = class_name;
1485 ops[0].cls.class_len = (__u8) class_name_len;
1486 ops[0].cls.method_name = method_name;
1487 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1488 ops[0].cls.argc = 0;
3cb4a687
AE
1489 ops[0].cls.indata = outbound;
1490 ops[0].cls.indata_len = outbound_size;
602adf40 1491
0ce1a794 1492 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1493 CEPH_NOSNAP,
3cb4a687 1494 flags, ops,
f8d4de6e
AE
1495 object_name, 0, inbound_size, inbound,
1496 NULL, ver);
602adf40
YS
1497
1498 rbd_destroy_ops(ops);
1499
1500 dout("cls_exec returned %d\n", ret);
1501 return ret;
1502}
1503
1fec7093
YS
1504static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1505{
1506 struct rbd_req_coll *coll =
1507 kzalloc(sizeof(struct rbd_req_coll) +
1508 sizeof(struct rbd_req_status) * num_reqs,
1509 GFP_ATOMIC);
1510
1511 if (!coll)
1512 return NULL;
1513 coll->total = num_reqs;
1514 kref_init(&coll->kref);
1515 return coll;
1516}
1517
602adf40
YS
1518/*
1519 * block device queue callback
1520 */
1521static void rbd_rq_fn(struct request_queue *q)
1522{
1523 struct rbd_device *rbd_dev = q->queuedata;
1524 struct request *rq;
602adf40 1525
00f1f36f 1526 while ((rq = blk_fetch_request(q))) {
602adf40 1527 struct bio *bio;
602adf40 1528 bool do_write;
bd919d45 1529 unsigned int size;
602adf40 1530 u64 ofs;
1fec7093
YS
1531 int num_segs, cur_seg = 0;
1532 struct rbd_req_coll *coll;
d1d25646 1533 struct ceph_snap_context *snapc;
f7760dad 1534 unsigned int bio_offset;
602adf40 1535
602adf40
YS
1536 dout("fetched request\n");
1537
1538 /* filter out block requests we don't understand */
1539 if ((rq->cmd_type != REQ_TYPE_FS)) {
1540 __blk_end_request_all(rq, 0);
00f1f36f 1541 continue;
602adf40
YS
1542 }
1543
1544 /* deduce our operation (read, write) */
1545 do_write = (rq_data_dir(rq) == WRITE);
f84344f3 1546 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1547 __blk_end_request_all(rq, -EROFS);
00f1f36f 1548 continue;
602adf40
YS
1549 }
1550
1551 spin_unlock_irq(q->queue_lock);
1552
d1d25646 1553 down_read(&rbd_dev->header_rwsem);
e88a36ec 1554
daba5fdb 1555 if (!rbd_dev->exists) {
0d7dbfce 1556 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
e88a36ec 1557 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1558 dout("request for non-existent snapshot");
1559 spin_lock_irq(q->queue_lock);
1560 __blk_end_request_all(rq, -ENXIO);
1561 continue;
e88a36ec
JD
1562 }
1563
d1d25646
JD
1564 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1565
1566 up_read(&rbd_dev->header_rwsem);
1567
f7760dad
AE
1568 size = blk_rq_bytes(rq);
1569 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1570 bio = rq->bio;
1571
602adf40
YS
1572 dout("%s 0x%x bytes at 0x%llx\n",
1573 do_write ? "write" : "read",
bd919d45 1574 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1575
1fec7093 1576 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1577 if (num_segs <= 0) {
1578 spin_lock_irq(q->queue_lock);
1579 __blk_end_request_all(rq, num_segs);
1580 ceph_put_snap_context(snapc);
1581 continue;
1582 }
1fec7093
YS
1583 coll = rbd_alloc_coll(num_segs);
1584 if (!coll) {
1585 spin_lock_irq(q->queue_lock);
1586 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1587 ceph_put_snap_context(snapc);
00f1f36f 1588 continue;
1fec7093
YS
1589 }
1590
f7760dad 1591 bio_offset = 0;
602adf40 1592 do {
f7760dad
AE
1593 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1594 unsigned int chain_size;
1595 struct bio *bio_chain;
1596
1597 BUG_ON(limit > (u64) UINT_MAX);
1598 chain_size = (unsigned int) limit;
bd919d45 1599 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
f7760dad 1600
1fec7093 1601 kref_get(&coll->kref);
f7760dad
AE
1602
1603 /* Pass a cloned bio chain via an osd request */
1604
1605 bio_chain = bio_chain_clone_range(&bio,
1606 &bio_offset, chain_size,
1607 GFP_ATOMIC);
1608 if (bio_chain)
4634246d 1609 (void) rbd_do_op(rq, rbd_dev, snapc,
f7760dad
AE
1610 ofs, chain_size,
1611 bio_chain, coll, cur_seg);
4634246d 1612 else
1fec7093 1613 rbd_coll_end_req_index(rq, coll, cur_seg,
f7760dad
AE
1614 -ENOMEM, chain_size);
1615 size -= chain_size;
1616 ofs += chain_size;
602adf40 1617
1fec7093 1618 cur_seg++;
602adf40 1619 } while (size > 0);
1fec7093 1620 kref_put(&coll->kref, rbd_coll_release);
602adf40 1621
602adf40 1622 spin_lock_irq(q->queue_lock);
d1d25646
JD
1623
1624 ceph_put_snap_context(snapc);
602adf40
YS
1625 }
1626}
1627
1628/*
1629 * a queue callback. Makes sure that we don't create a bio that spans across
1630 * multiple osd objects. One exception would be with a single page bios,
f7760dad 1631 * which we handle later at bio_chain_clone_range()
602adf40
YS
1632 */
1633static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1634 struct bio_vec *bvec)
1635{
1636 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
1637 sector_t sector_offset;
1638 sector_t sectors_per_obj;
1639 sector_t obj_sector_offset;
1640 int ret;
1641
1642 /*
1643 * Find how far into its rbd object the partition-relative
1644 * bio start sector is to offset relative to the enclosing
1645 * device.
1646 */
1647 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1648 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1649 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
1650
1651 /*
1652 * Compute the number of bytes from that offset to the end
1653 * of the object. Account for what's already used by the bio.
1654 */
1655 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1656 if (ret > bmd->bi_size)
1657 ret -= bmd->bi_size;
1658 else
1659 ret = 0;
1660
1661 /*
1662 * Don't send back more than was asked for. And if the bio
1663 * was empty, let the whole thing through because: "Note
1664 * that a block device *must* allow a single page to be
1665 * added to an empty bio."
1666 */
1667 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1668 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1669 ret = (int) bvec->bv_len;
1670
1671 return ret;
602adf40
YS
1672}
1673
1674static void rbd_free_disk(struct rbd_device *rbd_dev)
1675{
1676 struct gendisk *disk = rbd_dev->disk;
1677
1678 if (!disk)
1679 return;
1680
602adf40
YS
1681 if (disk->flags & GENHD_FL_UP)
1682 del_gendisk(disk);
1683 if (disk->queue)
1684 blk_cleanup_queue(disk->queue);
1685 put_disk(disk);
1686}
1687
1688/*
4156d998
AE
1689 * Read the complete header for the given rbd device.
1690 *
1691 * Returns a pointer to a dynamically-allocated buffer containing
1692 * the complete and validated header. Caller can pass the address
1693 * of a variable that will be filled in with the version of the
1694 * header object at the time it was read.
1695 *
1696 * Returns a pointer-coded errno if a failure occurs.
602adf40 1697 */
4156d998
AE
1698static struct rbd_image_header_ondisk *
1699rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1700{
4156d998 1701 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1702 u32 snap_count = 0;
4156d998
AE
1703 u64 names_size = 0;
1704 u32 want_count;
1705 int ret;
602adf40 1706
00f1f36f 1707 /*
4156d998
AE
1708 * The complete header will include an array of its 64-bit
1709 * snapshot ids, followed by the names of those snapshots as
1710 * a contiguous block of NUL-terminated strings. Note that
1711 * the number of snapshots could change by the time we read
1712 * it in, in which case we re-read it.
00f1f36f 1713 */
4156d998
AE
1714 do {
1715 size_t size;
1716
1717 kfree(ondisk);
1718
1719 size = sizeof (*ondisk);
1720 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1721 size += names_size;
1722 ondisk = kmalloc(size, GFP_KERNEL);
1723 if (!ondisk)
1724 return ERR_PTR(-ENOMEM);
1725
1726 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1727 rbd_dev->header_name,
4156d998
AE
1728 0, size,
1729 (char *) ondisk, version);
1730
1731 if (ret < 0)
1732 goto out_err;
1733 if (WARN_ON((size_t) ret < size)) {
1734 ret = -ENXIO;
1735 pr_warning("short header read for image %s"
1736 " (want %zd got %d)\n",
0d7dbfce 1737 rbd_dev->spec->image_name, size, ret);
4156d998
AE
1738 goto out_err;
1739 }
1740 if (!rbd_dev_ondisk_valid(ondisk)) {
1741 ret = -ENXIO;
1742 pr_warning("invalid header for image %s\n",
0d7dbfce 1743 rbd_dev->spec->image_name);
4156d998 1744 goto out_err;
81e759fb 1745 }
602adf40 1746
4156d998
AE
1747 names_size = le64_to_cpu(ondisk->snap_names_len);
1748 want_count = snap_count;
1749 snap_count = le32_to_cpu(ondisk->snap_count);
1750 } while (snap_count != want_count);
00f1f36f 1751
4156d998 1752 return ondisk;
00f1f36f 1753
4156d998
AE
1754out_err:
1755 kfree(ondisk);
1756
1757 return ERR_PTR(ret);
1758}
1759
1760/*
1761 * reload the ondisk the header
1762 */
1763static int rbd_read_header(struct rbd_device *rbd_dev,
1764 struct rbd_image_header *header)
1765{
1766 struct rbd_image_header_ondisk *ondisk;
1767 u64 ver = 0;
1768 int ret;
602adf40 1769
4156d998
AE
1770 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1771 if (IS_ERR(ondisk))
1772 return PTR_ERR(ondisk);
1773 ret = rbd_header_from_disk(header, ondisk);
1774 if (ret >= 0)
1775 header->obj_version = ver;
1776 kfree(ondisk);
1777
1778 return ret;
602adf40
YS
1779}
1780
41f38c2b 1781static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
1782{
1783 struct rbd_snap *snap;
a0593290 1784 struct rbd_snap *next;
dfc5606d 1785
a0593290 1786 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 1787 rbd_remove_snap_dev(snap);
dfc5606d
YS
1788}
1789
9478554a
AE
1790static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1791{
1792 sector_t size;
1793
0d7dbfce 1794 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
1795 return;
1796
1797 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1798 dout("setting size to %llu sectors", (unsigned long long) size);
1799 rbd_dev->mapping.size = (u64) size;
1800 set_capacity(rbd_dev->disk, size);
1801}
1802
602adf40
YS
1803/*
1804 * only read the first part of the ondisk header, without the snaps info
1805 */
117973fb 1806static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1807{
1808 int ret;
1809 struct rbd_image_header h;
602adf40
YS
1810
1811 ret = rbd_read_header(rbd_dev, &h);
1812 if (ret < 0)
1813 return ret;
1814
a51aa0c0
JD
1815 down_write(&rbd_dev->header_rwsem);
1816
9478554a
AE
1817 /* Update image size, and check for resize of mapped image */
1818 rbd_dev->header.image_size = h.image_size;
1819 rbd_update_mapping_size(rbd_dev);
9db4b3e3 1820
849b4260 1821 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1822 kfree(rbd_dev->header.snap_sizes);
849b4260 1823 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1824 /* osd requests may still refer to snapc */
1825 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1826
b813623a
AE
1827 if (hver)
1828 *hver = h.obj_version;
a71b891b 1829 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1830 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1831 rbd_dev->header.snapc = h.snapc;
1832 rbd_dev->header.snap_names = h.snap_names;
1833 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1834 /* Free the extra copy of the object prefix */
1835 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1836 kfree(h.object_prefix);
1837
304f6808
AE
1838 ret = rbd_dev_snaps_update(rbd_dev);
1839 if (!ret)
1840 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1841
c666601a 1842 up_write(&rbd_dev->header_rwsem);
602adf40 1843
dfc5606d 1844 return ret;
602adf40
YS
1845}
1846
117973fb 1847static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
1848{
1849 int ret;
1850
117973fb 1851 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 1852 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
1853 if (rbd_dev->image_format == 1)
1854 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1855 else
1856 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
1857 mutex_unlock(&ctl_mutex);
1858
1859 return ret;
1860}
1861
602adf40
YS
1862static int rbd_init_disk(struct rbd_device *rbd_dev)
1863{
1864 struct gendisk *disk;
1865 struct request_queue *q;
593a9e7b 1866 u64 segment_size;
602adf40 1867
602adf40 1868 /* create gendisk info */
602adf40
YS
1869 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1870 if (!disk)
1fcdb8aa 1871 return -ENOMEM;
602adf40 1872
f0f8cef5 1873 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1874 rbd_dev->dev_id);
602adf40
YS
1875 disk->major = rbd_dev->major;
1876 disk->first_minor = 0;
1877 disk->fops = &rbd_bd_ops;
1878 disk->private_data = rbd_dev;
1879
1880 /* init rq */
602adf40
YS
1881 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1882 if (!q)
1883 goto out_disk;
029bcbd8 1884
593a9e7b
AE
1885 /* We use the default size, but let's be explicit about it. */
1886 blk_queue_physical_block_size(q, SECTOR_SIZE);
1887
029bcbd8 1888 /* set io sizes to object size */
593a9e7b
AE
1889 segment_size = rbd_obj_bytes(&rbd_dev->header);
1890 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1891 blk_queue_max_segment_size(q, segment_size);
1892 blk_queue_io_min(q, segment_size);
1893 blk_queue_io_opt(q, segment_size);
029bcbd8 1894
602adf40
YS
1895 blk_queue_merge_bvec(q, rbd_merge_bvec);
1896 disk->queue = q;
1897
1898 q->queuedata = rbd_dev;
1899
1900 rbd_dev->disk = disk;
602adf40 1901
12f02944
AE
1902 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1903
602adf40 1904 return 0;
602adf40
YS
1905out_disk:
1906 put_disk(disk);
1fcdb8aa
AE
1907
1908 return -ENOMEM;
602adf40
YS
1909}
1910
dfc5606d
YS
1911/*
1912 sysfs
1913*/
1914
593a9e7b
AE
1915static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1916{
1917 return container_of(dev, struct rbd_device, dev);
1918}
1919
dfc5606d
YS
1920static ssize_t rbd_size_show(struct device *dev,
1921 struct device_attribute *attr, char *buf)
1922{
593a9e7b 1923 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1924 sector_t size;
1925
1926 down_read(&rbd_dev->header_rwsem);
1927 size = get_capacity(rbd_dev->disk);
1928 up_read(&rbd_dev->header_rwsem);
dfc5606d 1929
a51aa0c0 1930 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1931}
1932
34b13184
AE
1933/*
1934 * Note this shows the features for whatever's mapped, which is not
1935 * necessarily the base image.
1936 */
1937static ssize_t rbd_features_show(struct device *dev,
1938 struct device_attribute *attr, char *buf)
1939{
1940 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1941
1942 return sprintf(buf, "0x%016llx\n",
1943 (unsigned long long) rbd_dev->mapping.features);
1944}
1945
dfc5606d
YS
1946static ssize_t rbd_major_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
1948{
593a9e7b 1949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1950
dfc5606d
YS
1951 return sprintf(buf, "%d\n", rbd_dev->major);
1952}
1953
1954static ssize_t rbd_client_id_show(struct device *dev,
1955 struct device_attribute *attr, char *buf)
602adf40 1956{
593a9e7b 1957 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1958
1dbb4399
AE
1959 return sprintf(buf, "client%lld\n",
1960 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1961}
1962
dfc5606d
YS
1963static ssize_t rbd_pool_show(struct device *dev,
1964 struct device_attribute *attr, char *buf)
602adf40 1965{
593a9e7b 1966 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1967
0d7dbfce 1968 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
1969}
1970
9bb2f334
AE
1971static ssize_t rbd_pool_id_show(struct device *dev,
1972 struct device_attribute *attr, char *buf)
1973{
1974 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1975
0d7dbfce
AE
1976 return sprintf(buf, "%llu\n",
1977 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
1978}
1979
dfc5606d
YS
1980static ssize_t rbd_name_show(struct device *dev,
1981 struct device_attribute *attr, char *buf)
1982{
593a9e7b 1983 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1984
a92ffdf8
AE
1985 if (rbd_dev->spec->image_name)
1986 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
1987
1988 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
1989}
1990
589d30e0
AE
1991static ssize_t rbd_image_id_show(struct device *dev,
1992 struct device_attribute *attr, char *buf)
1993{
1994 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1995
0d7dbfce 1996 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
1997}
1998
34b13184
AE
1999/*
2000 * Shows the name of the currently-mapped snapshot (or
2001 * RBD_SNAP_HEAD_NAME for the base image).
2002 */
dfc5606d
YS
2003static ssize_t rbd_snap_show(struct device *dev,
2004 struct device_attribute *attr,
2005 char *buf)
2006{
593a9e7b 2007 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2008
0d7dbfce 2009 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2010}
2011
2012static ssize_t rbd_image_refresh(struct device *dev,
2013 struct device_attribute *attr,
2014 const char *buf,
2015 size_t size)
2016{
593a9e7b 2017 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2018 int ret;
602adf40 2019
117973fb 2020 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2021
2022 return ret < 0 ? ret : size;
dfc5606d 2023}
602adf40 2024
dfc5606d 2025static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2026static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2027static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2028static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2029static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2030static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2031static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2032static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2033static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2034static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
dfc5606d
YS
2035
2036static struct attribute *rbd_attrs[] = {
2037 &dev_attr_size.attr,
34b13184 2038 &dev_attr_features.attr,
dfc5606d
YS
2039 &dev_attr_major.attr,
2040 &dev_attr_client_id.attr,
2041 &dev_attr_pool.attr,
9bb2f334 2042 &dev_attr_pool_id.attr,
dfc5606d 2043 &dev_attr_name.attr,
589d30e0 2044 &dev_attr_image_id.attr,
dfc5606d
YS
2045 &dev_attr_current_snap.attr,
2046 &dev_attr_refresh.attr,
dfc5606d
YS
2047 NULL
2048};
2049
2050static struct attribute_group rbd_attr_group = {
2051 .attrs = rbd_attrs,
2052};
2053
2054static const struct attribute_group *rbd_attr_groups[] = {
2055 &rbd_attr_group,
2056 NULL
2057};
2058
2059static void rbd_sysfs_dev_release(struct device *dev)
2060{
2061}
2062
2063static struct device_type rbd_device_type = {
2064 .name = "rbd",
2065 .groups = rbd_attr_groups,
2066 .release = rbd_sysfs_dev_release,
2067};
2068
2069
2070/*
2071 sysfs - snapshots
2072*/
2073
2074static ssize_t rbd_snap_size_show(struct device *dev,
2075 struct device_attribute *attr,
2076 char *buf)
2077{
2078 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2079
3591538f 2080 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2081}
2082
2083static ssize_t rbd_snap_id_show(struct device *dev,
2084 struct device_attribute *attr,
2085 char *buf)
2086{
2087 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2088
3591538f 2089 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2090}
2091
34b13184
AE
2092static ssize_t rbd_snap_features_show(struct device *dev,
2093 struct device_attribute *attr,
2094 char *buf)
2095{
2096 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2097
2098 return sprintf(buf, "0x%016llx\n",
2099 (unsigned long long) snap->features);
2100}
2101
dfc5606d
YS
2102static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2103static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2104static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2105
2106static struct attribute *rbd_snap_attrs[] = {
2107 &dev_attr_snap_size.attr,
2108 &dev_attr_snap_id.attr,
34b13184 2109 &dev_attr_snap_features.attr,
dfc5606d
YS
2110 NULL,
2111};
2112
2113static struct attribute_group rbd_snap_attr_group = {
2114 .attrs = rbd_snap_attrs,
2115};
2116
2117static void rbd_snap_dev_release(struct device *dev)
2118{
2119 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2120 kfree(snap->name);
2121 kfree(snap);
2122}
2123
2124static const struct attribute_group *rbd_snap_attr_groups[] = {
2125 &rbd_snap_attr_group,
2126 NULL
2127};
2128
2129static struct device_type rbd_snap_device_type = {
2130 .groups = rbd_snap_attr_groups,
2131 .release = rbd_snap_dev_release,
2132};
2133
8b8fb99c
AE
2134static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2135{
2136 kref_get(&spec->kref);
2137
2138 return spec;
2139}
2140
2141static void rbd_spec_free(struct kref *kref);
2142static void rbd_spec_put(struct rbd_spec *spec)
2143{
2144 if (spec)
2145 kref_put(&spec->kref, rbd_spec_free);
2146}
2147
2148static struct rbd_spec *rbd_spec_alloc(void)
2149{
2150 struct rbd_spec *spec;
2151
2152 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2153 if (!spec)
2154 return NULL;
2155 kref_init(&spec->kref);
2156
2157 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2158
2159 return spec;
2160}
2161
2162static void rbd_spec_free(struct kref *kref)
2163{
2164 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2165
2166 kfree(spec->pool_name);
2167 kfree(spec->image_id);
2168 kfree(spec->image_name);
2169 kfree(spec->snap_name);
2170 kfree(spec);
2171}
2172
c53d5893
AE
2173struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2174 struct rbd_spec *spec)
2175{
2176 struct rbd_device *rbd_dev;
2177
2178 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2179 if (!rbd_dev)
2180 return NULL;
2181
2182 spin_lock_init(&rbd_dev->lock);
2183 INIT_LIST_HEAD(&rbd_dev->node);
2184 INIT_LIST_HEAD(&rbd_dev->snaps);
2185 init_rwsem(&rbd_dev->header_rwsem);
2186
2187 rbd_dev->spec = spec;
2188 rbd_dev->rbd_client = rbdc;
2189
2190 return rbd_dev;
2191}
2192
2193static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2194{
2195 kfree(rbd_dev->header_name);
2196 rbd_put_client(rbd_dev->rbd_client);
2197 rbd_spec_put(rbd_dev->spec);
2198 kfree(rbd_dev);
2199}
2200
304f6808
AE
2201static bool rbd_snap_registered(struct rbd_snap *snap)
2202{
2203 bool ret = snap->dev.type == &rbd_snap_device_type;
2204 bool reg = device_is_registered(&snap->dev);
2205
2206 rbd_assert(!ret ^ reg);
2207
2208 return ret;
2209}
2210
41f38c2b 2211static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2212{
2213 list_del(&snap->node);
304f6808
AE
2214 if (device_is_registered(&snap->dev))
2215 device_unregister(&snap->dev);
dfc5606d
YS
2216}
2217
14e7085d 2218static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2219 struct device *parent)
2220{
2221 struct device *dev = &snap->dev;
2222 int ret;
2223
2224 dev->type = &rbd_snap_device_type;
2225 dev->parent = parent;
2226 dev->release = rbd_snap_dev_release;
d4b125e9 2227 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2228 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2229
dfc5606d
YS
2230 ret = device_register(dev);
2231
2232 return ret;
2233}
2234
4e891e0a 2235static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2236 const char *snap_name,
34b13184
AE
2237 u64 snap_id, u64 snap_size,
2238 u64 snap_features)
dfc5606d 2239{
4e891e0a 2240 struct rbd_snap *snap;
dfc5606d 2241 int ret;
4e891e0a
AE
2242
2243 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2244 if (!snap)
4e891e0a
AE
2245 return ERR_PTR(-ENOMEM);
2246
2247 ret = -ENOMEM;
c8d18425 2248 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2249 if (!snap->name)
2250 goto err;
2251
c8d18425
AE
2252 snap->id = snap_id;
2253 snap->size = snap_size;
34b13184 2254 snap->features = snap_features;
4e891e0a
AE
2255
2256 return snap;
2257
dfc5606d
YS
2258err:
2259 kfree(snap->name);
2260 kfree(snap);
4e891e0a
AE
2261
2262 return ERR_PTR(ret);
dfc5606d
YS
2263}
2264
cd892126
AE
2265static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2266 u64 *snap_size, u64 *snap_features)
2267{
2268 char *snap_name;
2269
2270 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2271
2272 *snap_size = rbd_dev->header.snap_sizes[which];
2273 *snap_features = 0; /* No features for v1 */
2274
2275 /* Skip over names until we find the one we are looking for */
2276
2277 snap_name = rbd_dev->header.snap_names;
2278 while (which--)
2279 snap_name += strlen(snap_name) + 1;
2280
2281 return snap_name;
2282}
2283
9d475de5
AE
2284/*
2285 * Get the size and object order for an image snapshot, or if
2286 * snap_id is CEPH_NOSNAP, gets this information for the base
2287 * image.
2288 */
2289static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2290 u8 *order, u64 *snap_size)
2291{
2292 __le64 snapid = cpu_to_le64(snap_id);
2293 int ret;
2294 struct {
2295 u8 order;
2296 __le64 size;
2297 } __attribute__ ((packed)) size_buf = { 0 };
2298
2299 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2300 "rbd", "get_size",
2301 (char *) &snapid, sizeof (snapid),
2302 (char *) &size_buf, sizeof (size_buf),
2303 CEPH_OSD_FLAG_READ, NULL);
2304 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2305 if (ret < 0)
2306 return ret;
2307
2308 *order = size_buf.order;
2309 *snap_size = le64_to_cpu(size_buf.size);
2310
2311 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2312 (unsigned long long) snap_id, (unsigned int) *order,
2313 (unsigned long long) *snap_size);
2314
2315 return 0;
2316}
2317
2318static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2319{
2320 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2321 &rbd_dev->header.obj_order,
2322 &rbd_dev->header.image_size);
2323}
2324
1e130199
AE
2325static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2326{
2327 void *reply_buf;
2328 int ret;
2329 void *p;
2330
2331 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2332 if (!reply_buf)
2333 return -ENOMEM;
2334
2335 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2336 "rbd", "get_object_prefix",
2337 NULL, 0,
2338 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2339 CEPH_OSD_FLAG_READ, NULL);
2340 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2341 if (ret < 0)
2342 goto out;
a0ea3a40 2343 ret = 0; /* rbd_req_sync_exec() can return positive */
1e130199
AE
2344
2345 p = reply_buf;
2346 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2347 p + RBD_OBJ_PREFIX_LEN_MAX,
2348 NULL, GFP_NOIO);
2349
2350 if (IS_ERR(rbd_dev->header.object_prefix)) {
2351 ret = PTR_ERR(rbd_dev->header.object_prefix);
2352 rbd_dev->header.object_prefix = NULL;
2353 } else {
2354 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2355 }
2356
2357out:
2358 kfree(reply_buf);
2359
2360 return ret;
2361}
2362
b1b5402a
AE
2363static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2364 u64 *snap_features)
2365{
2366 __le64 snapid = cpu_to_le64(snap_id);
2367 struct {
2368 __le64 features;
2369 __le64 incompat;
2370 } features_buf = { 0 };
d889140c 2371 u64 incompat;
b1b5402a
AE
2372 int ret;
2373
2374 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2375 "rbd", "get_features",
2376 (char *) &snapid, sizeof (snapid),
2377 (char *) &features_buf, sizeof (features_buf),
2378 CEPH_OSD_FLAG_READ, NULL);
2379 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2380 if (ret < 0)
2381 return ret;
d889140c
AE
2382
2383 incompat = le64_to_cpu(features_buf.incompat);
2384 if (incompat & ~RBD_FEATURES_ALL)
2385 return -ENOTSUPP;
2386
b1b5402a
AE
2387 *snap_features = le64_to_cpu(features_buf.features);
2388
2389 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2390 (unsigned long long) snap_id,
2391 (unsigned long long) *snap_features,
2392 (unsigned long long) le64_to_cpu(features_buf.incompat));
2393
2394 return 0;
2395}
2396
2397static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2398{
2399 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2400 &rbd_dev->header.features);
2401}
2402
6e14b1a6 2403static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
2404{
2405 size_t size;
2406 int ret;
2407 void *reply_buf;
2408 void *p;
2409 void *end;
2410 u64 seq;
2411 u32 snap_count;
2412 struct ceph_snap_context *snapc;
2413 u32 i;
2414
2415 /*
2416 * We'll need room for the seq value (maximum snapshot id),
2417 * snapshot count, and array of that many snapshot ids.
2418 * For now we have a fixed upper limit on the number we're
2419 * prepared to receive.
2420 */
2421 size = sizeof (__le64) + sizeof (__le32) +
2422 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2423 reply_buf = kzalloc(size, GFP_KERNEL);
2424 if (!reply_buf)
2425 return -ENOMEM;
2426
2427 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2428 "rbd", "get_snapcontext",
2429 NULL, 0,
2430 reply_buf, size,
6e14b1a6 2431 CEPH_OSD_FLAG_READ, ver);
35d489f9
AE
2432 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2433 if (ret < 0)
2434 goto out;
2435
2436 ret = -ERANGE;
2437 p = reply_buf;
2438 end = (char *) reply_buf + size;
2439 ceph_decode_64_safe(&p, end, seq, out);
2440 ceph_decode_32_safe(&p, end, snap_count, out);
2441
2442 /*
2443 * Make sure the reported number of snapshot ids wouldn't go
2444 * beyond the end of our buffer. But before checking that,
2445 * make sure the computed size of the snapshot context we
2446 * allocate is representable in a size_t.
2447 */
2448 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2449 / sizeof (u64)) {
2450 ret = -EINVAL;
2451 goto out;
2452 }
2453 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2454 goto out;
2455
2456 size = sizeof (struct ceph_snap_context) +
2457 snap_count * sizeof (snapc->snaps[0]);
2458 snapc = kmalloc(size, GFP_KERNEL);
2459 if (!snapc) {
2460 ret = -ENOMEM;
2461 goto out;
2462 }
2463
2464 atomic_set(&snapc->nref, 1);
2465 snapc->seq = seq;
2466 snapc->num_snaps = snap_count;
2467 for (i = 0; i < snap_count; i++)
2468 snapc->snaps[i] = ceph_decode_64(&p);
2469
2470 rbd_dev->header.snapc = snapc;
2471
2472 dout(" snap context seq = %llu, snap_count = %u\n",
2473 (unsigned long long) seq, (unsigned int) snap_count);
2474
2475out:
2476 kfree(reply_buf);
2477
2478 return 0;
2479}
2480
b8b1e2db
AE
2481static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2482{
2483 size_t size;
2484 void *reply_buf;
2485 __le64 snap_id;
2486 int ret;
2487 void *p;
2488 void *end;
b8b1e2db
AE
2489 char *snap_name;
2490
2491 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2492 reply_buf = kmalloc(size, GFP_KERNEL);
2493 if (!reply_buf)
2494 return ERR_PTR(-ENOMEM);
2495
2496 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2497 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2498 "rbd", "get_snapshot_name",
2499 (char *) &snap_id, sizeof (snap_id),
2500 reply_buf, size,
2501 CEPH_OSD_FLAG_READ, NULL);
2502 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2503 if (ret < 0)
2504 goto out;
2505
2506 p = reply_buf;
2507 end = (char *) reply_buf + size;
e5c35534 2508 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
2509 if (IS_ERR(snap_name)) {
2510 ret = PTR_ERR(snap_name);
2511 goto out;
2512 } else {
2513 dout(" snap_id 0x%016llx snap_name = %s\n",
2514 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2515 }
2516 kfree(reply_buf);
2517
2518 return snap_name;
2519out:
2520 kfree(reply_buf);
2521
2522 return ERR_PTR(ret);
2523}
2524
2525static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2526 u64 *snap_size, u64 *snap_features)
2527{
2528 __le64 snap_id;
2529 u8 order;
2530 int ret;
2531
2532 snap_id = rbd_dev->header.snapc->snaps[which];
2533 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2534 if (ret)
2535 return ERR_PTR(ret);
2536 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2537 if (ret)
2538 return ERR_PTR(ret);
2539
2540 return rbd_dev_v2_snap_name(rbd_dev, which);
2541}
2542
2543static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2544 u64 *snap_size, u64 *snap_features)
2545{
2546 if (rbd_dev->image_format == 1)
2547 return rbd_dev_v1_snap_info(rbd_dev, which,
2548 snap_size, snap_features);
2549 if (rbd_dev->image_format == 2)
2550 return rbd_dev_v2_snap_info(rbd_dev, which,
2551 snap_size, snap_features);
2552 return ERR_PTR(-EINVAL);
2553}
2554
117973fb
AE
2555static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2556{
2557 int ret;
2558 __u8 obj_order;
2559
2560 down_write(&rbd_dev->header_rwsem);
2561
2562 /* Grab old order first, to see if it changes */
2563
2564 obj_order = rbd_dev->header.obj_order,
2565 ret = rbd_dev_v2_image_size(rbd_dev);
2566 if (ret)
2567 goto out;
2568 if (rbd_dev->header.obj_order != obj_order) {
2569 ret = -EIO;
2570 goto out;
2571 }
2572 rbd_update_mapping_size(rbd_dev);
2573
2574 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2575 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2576 if (ret)
2577 goto out;
2578 ret = rbd_dev_snaps_update(rbd_dev);
2579 dout("rbd_dev_snaps_update returned %d\n", ret);
2580 if (ret)
2581 goto out;
2582 ret = rbd_dev_snaps_register(rbd_dev);
2583 dout("rbd_dev_snaps_register returned %d\n", ret);
2584out:
2585 up_write(&rbd_dev->header_rwsem);
2586
2587 return ret;
2588}
2589
dfc5606d 2590/*
35938150
AE
2591 * Scan the rbd device's current snapshot list and compare it to the
2592 * newly-received snapshot context. Remove any existing snapshots
2593 * not present in the new snapshot context. Add a new snapshot for
2594 * any snaphots in the snapshot context not in the current list.
2595 * And verify there are no changes to snapshots we already know
2596 * about.
2597 *
2598 * Assumes the snapshots in the snapshot context are sorted by
2599 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2600 * are also maintained in that order.)
dfc5606d 2601 */
304f6808 2602static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2603{
35938150
AE
2604 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2605 const u32 snap_count = snapc->num_snaps;
35938150
AE
2606 struct list_head *head = &rbd_dev->snaps;
2607 struct list_head *links = head->next;
2608 u32 index = 0;
dfc5606d 2609
9fcbb800 2610 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2611 while (index < snap_count || links != head) {
2612 u64 snap_id;
2613 struct rbd_snap *snap;
cd892126
AE
2614 char *snap_name;
2615 u64 snap_size = 0;
2616 u64 snap_features = 0;
dfc5606d 2617
35938150
AE
2618 snap_id = index < snap_count ? snapc->snaps[index]
2619 : CEPH_NOSNAP;
2620 snap = links != head ? list_entry(links, struct rbd_snap, node)
2621 : NULL;
aafb230e 2622 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2623
35938150
AE
2624 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2625 struct list_head *next = links->next;
dfc5606d 2626
35938150 2627 /* Existing snapshot not in the new snap context */
dfc5606d 2628
0d7dbfce 2629 if (rbd_dev->spec->snap_id == snap->id)
daba5fdb 2630 rbd_dev->exists = false;
41f38c2b 2631 rbd_remove_snap_dev(snap);
9fcbb800 2632 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
2633 rbd_dev->spec->snap_id == snap->id ?
2634 "mapped " : "",
9fcbb800 2635 (unsigned long long) snap->id);
35938150
AE
2636
2637 /* Done with this list entry; advance */
2638
2639 links = next;
dfc5606d
YS
2640 continue;
2641 }
35938150 2642
b8b1e2db
AE
2643 snap_name = rbd_dev_snap_info(rbd_dev, index,
2644 &snap_size, &snap_features);
cd892126
AE
2645 if (IS_ERR(snap_name))
2646 return PTR_ERR(snap_name);
2647
9fcbb800
AE
2648 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2649 (unsigned long long) snap_id);
35938150
AE
2650 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2651 struct rbd_snap *new_snap;
2652
2653 /* We haven't seen this snapshot before */
2654
c8d18425 2655 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2656 snap_id, snap_size, snap_features);
9fcbb800
AE
2657 if (IS_ERR(new_snap)) {
2658 int err = PTR_ERR(new_snap);
2659
2660 dout(" failed to add dev, error %d\n", err);
2661
2662 return err;
2663 }
35938150
AE
2664
2665 /* New goes before existing, or at end of list */
2666
9fcbb800 2667 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2668 if (snap)
2669 list_add_tail(&new_snap->node, &snap->node);
2670 else
523f3258 2671 list_add_tail(&new_snap->node, head);
35938150
AE
2672 } else {
2673 /* Already have this one */
2674
9fcbb800
AE
2675 dout(" already present\n");
2676
cd892126 2677 rbd_assert(snap->size == snap_size);
aafb230e 2678 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2679 rbd_assert(snap->features == snap_features);
35938150
AE
2680
2681 /* Done with this list entry; advance */
2682
2683 links = links->next;
dfc5606d 2684 }
35938150
AE
2685
2686 /* Advance to the next entry in the snapshot context */
2687
2688 index++;
dfc5606d 2689 }
9fcbb800 2690 dout("%s: done\n", __func__);
dfc5606d
YS
2691
2692 return 0;
2693}
2694
304f6808
AE
2695/*
2696 * Scan the list of snapshots and register the devices for any that
2697 * have not already been registered.
2698 */
2699static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2700{
2701 struct rbd_snap *snap;
2702 int ret = 0;
2703
2704 dout("%s called\n", __func__);
86ff77bb
AE
2705 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2706 return -EIO;
304f6808
AE
2707
2708 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2709 if (!rbd_snap_registered(snap)) {
2710 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2711 if (ret < 0)
2712 break;
2713 }
2714 }
2715 dout("%s: returning %d\n", __func__, ret);
2716
2717 return ret;
2718}
2719
dfc5606d
YS
2720static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2721{
dfc5606d 2722 struct device *dev;
cd789ab9 2723 int ret;
dfc5606d
YS
2724
2725 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2726
cd789ab9 2727 dev = &rbd_dev->dev;
dfc5606d
YS
2728 dev->bus = &rbd_bus_type;
2729 dev->type = &rbd_device_type;
2730 dev->parent = &rbd_root_dev;
2731 dev->release = rbd_dev_release;
de71a297 2732 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2733 ret = device_register(dev);
dfc5606d 2734
dfc5606d 2735 mutex_unlock(&ctl_mutex);
cd789ab9 2736
dfc5606d 2737 return ret;
602adf40
YS
2738}
2739
dfc5606d
YS
2740static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2741{
2742 device_unregister(&rbd_dev->dev);
2743}
2744
59c2be1e
YS
2745static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2746{
2747 int ret, rc;
2748
2749 do {
0e6f322d 2750 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2751 if (ret == -ERANGE) {
117973fb 2752 rc = rbd_dev_refresh(rbd_dev, NULL);
59c2be1e
YS
2753 if (rc < 0)
2754 return rc;
2755 }
2756 } while (ret == -ERANGE);
2757
2758 return ret;
2759}
2760
e2839308 2761static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
2762
2763/*
499afd5b
AE
2764 * Get a unique rbd identifier for the given new rbd_dev, and add
2765 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 2766 */
e2839308 2767static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 2768{
e2839308 2769 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
2770
2771 spin_lock(&rbd_dev_list_lock);
2772 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2773 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
2774 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2775 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 2776}
b7f23c36 2777
1ddbe94e 2778/*
499afd5b
AE
2779 * Remove an rbd_dev from the global list, and record that its
2780 * identifier is no longer in use.
1ddbe94e 2781 */
e2839308 2782static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2783{
d184f6bf 2784 struct list_head *tmp;
de71a297 2785 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2786 int max_id;
2787
aafb230e 2788 rbd_assert(rbd_id > 0);
499afd5b 2789
e2839308
AE
2790 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2791 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2792 spin_lock(&rbd_dev_list_lock);
2793 list_del_init(&rbd_dev->node);
d184f6bf
AE
2794
2795 /*
2796 * If the id being "put" is not the current maximum, there
2797 * is nothing special we need to do.
2798 */
e2839308 2799 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2800 spin_unlock(&rbd_dev_list_lock);
2801 return;
2802 }
2803
2804 /*
2805 * We need to update the current maximum id. Search the
2806 * list to find out what it is. We're more likely to find
2807 * the maximum at the end, so search the list backward.
2808 */
2809 max_id = 0;
2810 list_for_each_prev(tmp, &rbd_dev_list) {
2811 struct rbd_device *rbd_dev;
2812
2813 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
2814 if (rbd_dev->dev_id > max_id)
2815 max_id = rbd_dev->dev_id;
d184f6bf 2816 }
499afd5b 2817 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2818
1ddbe94e 2819 /*
e2839308 2820 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2821 * which case it now accurately reflects the new maximum.
2822 * Be careful not to overwrite the maximum value in that
2823 * case.
1ddbe94e 2824 */
e2839308
AE
2825 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2826 dout(" max dev id has been reset\n");
b7f23c36
AE
2827}
2828
e28fff26
AE
/*
 * Advance *buf past any leading white space and return the length of
 * the token (maximal run of non-space characters) that follows.  *buf
 * is left pointing at the first non-space character; it must be
 * NUL-terminated.  A return of 0 means no token remains.
 */
static inline size_t next_token(const char **buf)
{
	/* The characters isspace() matches in the "C"/"POSIX" locales */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip to start of token */

	return strcspn(*buf, spaces);	/* length of that token */
}
2847
/*
 * Find the next token in *buf and, if the supplied buffer is large
 * enough, copy it there as a NUL-terminated string.  Returns the
 * token length (not counting the '\0'): 0 if no token was found, and
 * >= token_size if the token would not fit (in which case the token
 * buffer is left untouched).  *buf must be NUL-terminated on entry
 * and is advanced past the token even when the copy is skipped.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* The characters isspace() matches in the "C"/"POSIX" locales */
	const char *spaces = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);	/* skip leading white space */
	len = strcspn(*buf, spaces);	/* measure the token */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2877
ea3352f4
AE
2878/*
2879 * Finds the next token in *buf, dynamically allocates a buffer big
2880 * enough to hold a copy of it, and copies the token into the new
2881 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2882 * that a duplicate buffer is created even for a zero-length token.
2883 *
2884 * Returns a pointer to the newly-allocated duplicate, or a null
2885 * pointer if memory for the duplicate was not available. If
2886 * the lenp argument is a non-null pointer, the length of the token
2887 * (not including the '\0') is returned in *lenp.
2888 *
2889 * If successful, the *buf pointer will be updated to point beyond
2890 * the end of the found token.
2891 *
2892 * Note: uses GFP_KERNEL for allocation.
2893 */
2894static inline char *dup_token(const char **buf, size_t *lenp)
2895{
2896 char *dup;
2897 size_t len;
2898
2899 len = next_token(buf);
2900 dup = kmalloc(len + 1, GFP_KERNEL);
2901 if (!dup)
2902 return NULL;
2903
2904 memcpy(dup, *buf, len);
2905 *(dup + len) = '\0';
2906 *buf += len;
2907
2908 if (lenp)
2909 *lenp = len;
2910
2911 return dup;
2912}
2913
a725f65e 2914/*
859c31df
AE
2915 * Parse the options provided for an "rbd add" (i.e., rbd image
2916 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
2917 * and the data written is passed here via a NUL-terminated buffer.
2918 * Returns 0 if successful or an error code otherwise.
d22f76e7 2919 *
859c31df
AE
2920 * The information extracted from these options is recorded in
2921 * the other parameters which return dynamically-allocated
2922 * structures:
2923 * ceph_opts
2924 * The address of a pointer that will refer to a ceph options
2925 * structure. Caller must release the returned pointer using
2926 * ceph_destroy_options() when it is no longer needed.
2927 * rbd_opts
2928 * Address of an rbd options pointer. Fully initialized by
2929 * this function; caller must release with kfree().
2930 * spec
2931 * Address of an rbd image specification pointer. Fully
2932 * initialized by this function based on parsed options.
2933 * Caller must release with rbd_spec_put().
2934 *
2935 * The options passed take this form:
2936 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
2937 * where:
2938 * <mon_addrs>
2939 * A comma-separated list of one or more monitor addresses.
2940 * A monitor address is an ip address, optionally followed
2941 * by a port number (separated by a colon).
2942 * I.e.: ip1[:port1][,ip2[:port2]...]
2943 * <options>
2944 * A comma-separated list of ceph and/or rbd options.
2945 * <pool_name>
2946 * The name of the rados pool containing the rbd image.
2947 * <image_name>
2948 * The name of the image in that pool to map.
2949 * <snap_id>
2950 * An optional snapshot id. If provided, the mapping will
2951 * present data from the image at the time that snapshot was
2952 * created. The image head is used if no snapshot id is
2953 * provided. Snapshot mappings are always read-only.
a725f65e 2954 */
859c31df 2955static int rbd_add_parse_args(const char *buf,
dc79b113 2956 struct ceph_options **ceph_opts,
859c31df
AE
2957 struct rbd_options **opts,
2958 struct rbd_spec **rbd_spec)
e28fff26 2959{
d22f76e7 2960 size_t len;
859c31df 2961 char *options;
0ddebc0c
AE
2962 const char *mon_addrs;
2963 size_t mon_addrs_size;
859c31df 2964 struct rbd_spec *spec = NULL;
4e9afeba 2965 struct rbd_options *rbd_opts = NULL;
859c31df 2966 struct ceph_options *copts;
dc79b113 2967 int ret;
e28fff26
AE
2968
2969 /* The first four tokens are required */
2970
7ef3214a
AE
2971 len = next_token(&buf);
2972 if (!len)
dc79b113 2973 return -EINVAL; /* Missing monitor address(es) */
0ddebc0c 2974 mon_addrs = buf;
f28e565a 2975 mon_addrs_size = len + 1;
7ef3214a 2976 buf += len;
a725f65e 2977
dc79b113 2978 ret = -EINVAL;
f28e565a
AE
2979 options = dup_token(&buf, NULL);
2980 if (!options)
dc79b113 2981 return -ENOMEM;
f28e565a
AE
2982 if (!*options)
2983 goto out_err; /* Missing options */
e28fff26 2984
859c31df
AE
2985 spec = rbd_spec_alloc();
2986 if (!spec)
f28e565a 2987 goto out_mem;
859c31df
AE
2988
2989 spec->pool_name = dup_token(&buf, NULL);
2990 if (!spec->pool_name)
2991 goto out_mem;
2992 if (!*spec->pool_name)
f28e565a 2993 goto out_err; /* Missing pool name */
e28fff26 2994
859c31df
AE
2995 spec->image_name = dup_token(&buf, &spec->image_name_len);
2996 if (!spec->image_name)
f28e565a 2997 goto out_mem;
859c31df 2998 if (!*spec->image_name)
f28e565a 2999 goto out_err; /* Missing image name */
d4b125e9 3000
f28e565a
AE
3001 /*
3002 * Snapshot name is optional; default is to use "-"
3003 * (indicating the head/no snapshot).
3004 */
3feeb894 3005 len = next_token(&buf);
820a5f3e 3006 if (!len) {
3feeb894
AE
3007 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3008 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 3009 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 3010 ret = -ENAMETOOLONG;
f28e565a 3011 goto out_err;
849b4260 3012 }
859c31df
AE
3013 spec->snap_name = kmalloc(len + 1, GFP_KERNEL);
3014 if (!spec->snap_name)
f28e565a 3015 goto out_mem;
859c31df
AE
3016 memcpy(spec->snap_name, buf, len);
3017 *(spec->snap_name + len) = '\0';
e5c35534 3018
0ddebc0c 3019 /* Initialize all rbd options to the defaults */
e28fff26 3020
4e9afeba
AE
3021 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3022 if (!rbd_opts)
3023 goto out_mem;
3024
3025 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
d22f76e7 3026
859c31df 3027 copts = ceph_parse_options(options, mon_addrs,
0ddebc0c 3028 mon_addrs + mon_addrs_size - 1,
4e9afeba 3029 parse_rbd_opts_token, rbd_opts);
859c31df
AE
3030 if (IS_ERR(copts)) {
3031 ret = PTR_ERR(copts);
dc79b113
AE
3032 goto out_err;
3033 }
859c31df
AE
3034 kfree(options);
3035
3036 *ceph_opts = copts;
4e9afeba 3037 *opts = rbd_opts;
859c31df 3038 *rbd_spec = spec;
0ddebc0c 3039
dc79b113 3040 return 0;
f28e565a 3041out_mem:
dc79b113 3042 ret = -ENOMEM;
d22f76e7 3043out_err:
859c31df
AE
3044 kfree(rbd_opts);
3045 rbd_spec_put(spec);
f28e565a 3046 kfree(options);
d22f76e7 3047
dc79b113 3048 return ret;
a725f65e
AE
3049}
3050
589d30e0
AE
3051/*
3052 * An rbd format 2 image has a unique identifier, distinct from the
3053 * name given to it by the user. Internally, that identifier is
3054 * what's used to specify the names of objects related to the image.
3055 *
3056 * A special "rbd id" object is used to map an rbd image name to its
3057 * id. If that object doesn't exist, then there is no v2 rbd image
3058 * with the supplied name.
3059 *
3060 * This function will record the given rbd_dev's image_id field if
3061 * it can be determined, and in that case will return 0. If any
3062 * errors occur a negative errno will be returned and the rbd_dev's
3063 * image_id field will be unchanged (and should be NULL).
3064 */
3065static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3066{
3067 int ret;
3068 size_t size;
3069 char *object_name;
3070 void *response;
3071 void *p;
3072
2c0d0a10
AE
3073 /*
3074 * When probing a parent image, the image id is already
3075 * known (and the image name likely is not). There's no
3076 * need to fetch the image id again in this case.
3077 */
3078 if (rbd_dev->spec->image_id)
3079 return 0;
3080
589d30e0
AE
3081 /*
3082 * First, see if the format 2 image id file exists, and if
3083 * so, get the image's persistent id from it.
3084 */
0d7dbfce 3085 size = sizeof (RBD_ID_PREFIX) + rbd_dev->spec->image_name_len;
589d30e0
AE
3086 object_name = kmalloc(size, GFP_NOIO);
3087 if (!object_name)
3088 return -ENOMEM;
0d7dbfce 3089 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
589d30e0
AE
3090 dout("rbd id object name is %s\n", object_name);
3091
3092 /* Response will be an encoded string, which includes a length */
3093
3094 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3095 response = kzalloc(size, GFP_NOIO);
3096 if (!response) {
3097 ret = -ENOMEM;
3098 goto out;
3099 }
3100
3101 ret = rbd_req_sync_exec(rbd_dev, object_name,
3102 "rbd", "get_id",
3103 NULL, 0,
3104 response, RBD_IMAGE_ID_LEN_MAX,
3105 CEPH_OSD_FLAG_READ, NULL);
3106 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3107 if (ret < 0)
3108 goto out;
a0ea3a40 3109 ret = 0; /* rbd_req_sync_exec() can return positive */
589d30e0
AE
3110
3111 p = response;
0d7dbfce 3112 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
589d30e0 3113 p + RBD_IMAGE_ID_LEN_MAX,
0d7dbfce 3114 &rbd_dev->spec->image_id_len,
589d30e0 3115 GFP_NOIO);
0d7dbfce
AE
3116 if (IS_ERR(rbd_dev->spec->image_id)) {
3117 ret = PTR_ERR(rbd_dev->spec->image_id);
3118 rbd_dev->spec->image_id = NULL;
589d30e0 3119 } else {
0d7dbfce 3120 dout("image_id is %s\n", rbd_dev->spec->image_id);
589d30e0
AE
3121 }
3122out:
3123 kfree(response);
3124 kfree(object_name);
3125
3126 return ret;
3127}
3128
a30b71b9
AE
/*
 * Probe for a format 1 ("v1") rbd image.  Records the header object
 * name (the image name plus RBD_SUFFIX) and reads the on-disk header
 * to populate the in-core image metadata.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * allocated here is freed again.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;
	rbd_dev->spec->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	/* sizeof (RBD_SUFFIX) covers the terminating NUL as well */
	size = rbd_dev->spec->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3172
/*
 * Probe for a format 2 ("v2") rbd image.  The image id has already
 * been determined by the caller; here we record the header object
 * name and fetch the image metadata (size/order, object prefix,
 * features, snapshot context) from the OSDs.
 *
 * Returns 0 on success or a negative errno; on failure the fields
 * allocated here are released again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	/* sizeof (RBD_HEADER_PREFIX) also covers the terminating NUL */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->spec->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3234
83a06263
AE
/*
 * Complete the probe of an rbd device: update the snapshot list, set
 * up the requested mapping, assign a unique device id, register the
 * block device, hook it into sysfs and announce the disk.
 *
 * Returns 0 on success or a negative errno, unwinding any partial
 * setup on failure (the unwind order mirrors the setup order).
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	/* major 0 asks the kernel to allocate an unused major for us */
	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_init_watch_dev(rbd_dev);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3312
a30b71b9
AE
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret) {
		dout("probe failed, returning %d\n", ret);

		return ret;
	}

	/* on failure, drop the header populated by the probe above */
	ret = rbd_dev_probe_finish(rbd_dev);
	if (ret)
		rbd_header_free(&rbd_dev->header);

	return ret;
}
3344
59c2be1e
YS
3345static ssize_t rbd_add(struct bus_type *bus,
3346 const char *buf,
3347 size_t count)
602adf40 3348{
cb8627c7 3349 struct rbd_device *rbd_dev = NULL;
dc79b113 3350 struct ceph_options *ceph_opts = NULL;
4e9afeba 3351 struct rbd_options *rbd_opts = NULL;
859c31df 3352 struct rbd_spec *spec = NULL;
9d3997fd 3353 struct rbd_client *rbdc;
27cc2594
AE
3354 struct ceph_osd_client *osdc;
3355 int rc = -ENOMEM;
602adf40
YS
3356
3357 if (!try_module_get(THIS_MODULE))
3358 return -ENODEV;
3359
602adf40 3360 /* parse add command */
859c31df 3361 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
dc79b113 3362 if (rc < 0)
bd4ba655 3363 goto err_out_module;
78cea76e 3364
9d3997fd
AE
3365 rbdc = rbd_get_client(ceph_opts);
3366 if (IS_ERR(rbdc)) {
3367 rc = PTR_ERR(rbdc);
0ddebc0c 3368 goto err_out_args;
9d3997fd 3369 }
c53d5893 3370 ceph_opts = NULL; /* rbd_dev client now owns this */
602adf40 3371
602adf40 3372 /* pick the pool */
9d3997fd 3373 osdc = &rbdc->client->osdc;
859c31df 3374 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
602adf40
YS
3375 if (rc < 0)
3376 goto err_out_client;
859c31df
AE
3377 spec->pool_id = (u64) rc;
3378
c53d5893 3379 rbd_dev = rbd_dev_create(rbdc, spec);
bd4ba655
AE
3380 if (!rbd_dev)
3381 goto err_out_client;
c53d5893
AE
3382 rbdc = NULL; /* rbd_dev now owns this */
3383 spec = NULL; /* rbd_dev now owns this */
602adf40 3384
bd4ba655 3385 rbd_dev->mapping.read_only = rbd_opts->read_only;
c53d5893
AE
3386 kfree(rbd_opts);
3387 rbd_opts = NULL; /* done with this */
bd4ba655 3388
a30b71b9
AE
3389 rc = rbd_dev_probe(rbd_dev);
3390 if (rc < 0)
c53d5893 3391 goto err_out_rbd_dev;
05fd6f6f 3392
602adf40 3393 return count;
c53d5893
AE
3394err_out_rbd_dev:
3395 rbd_dev_destroy(rbd_dev);
bd4ba655 3396err_out_client:
9d3997fd 3397 rbd_put_client(rbdc);
0ddebc0c 3398err_out_args:
78cea76e
AE
3399 if (ceph_opts)
3400 ceph_destroy_options(ceph_opts);
4e9afeba 3401 kfree(rbd_opts);
859c31df 3402 rbd_spec_put(spec);
bd4ba655
AE
3403err_out_module:
3404 module_put(THIS_MODULE);
27cc2594 3405
602adf40 3406 dout("Error adding device %s\n", buf);
27cc2594
AE
3407
3408 return (ssize_t) rc;
602adf40
YS
3409}
3410
de71a297 3411static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
3412{
3413 struct list_head *tmp;
3414 struct rbd_device *rbd_dev;
3415
e124a82f 3416 spin_lock(&rbd_dev_list_lock);
602adf40
YS
3417 list_for_each(tmp, &rbd_dev_list) {
3418 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 3419 if (rbd_dev->dev_id == dev_id) {
e124a82f 3420 spin_unlock(&rbd_dev_list_lock);
602adf40 3421 return rbd_dev;
e124a82f 3422 }
602adf40 3423 }
e124a82f 3424 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
3425 return NULL;
3426}
3427
/*
 * Device release callback, invoked by the driver core once the last
 * reference to the rbd device is dropped.  Tears down the watch,
 * the block device and all per-device state, then drops the module
 * reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);


	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3457
dfc5606d
YS
/*
 * sysfs "remove" handler: parse the target device id from @buf and
 * tear the corresponding rbd device down.
 *
 * Returns @count on success, -ENOENT if no such device exists, or
 * another negative errno on bad input.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* final teardown is driven by the device release callback */
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
3492
602adf40
YS
3493/*
3494 * create control files in sysfs
dfc5606d 3495 * /sys/bus/rbd/...
602adf40
YS
3496 */
3497static int rbd_sysfs_init(void)
3498{
dfc5606d 3499 int ret;
602adf40 3500
fed4c143 3501 ret = device_register(&rbd_root_dev);
21079786 3502 if (ret < 0)
dfc5606d 3503 return ret;
602adf40 3504
fed4c143
AE
3505 ret = bus_register(&rbd_bus_type);
3506 if (ret < 0)
3507 device_unregister(&rbd_root_dev);
602adf40 3508
602adf40
YS
3509 return ret;
3510}
3511
/* Remove the sysfs entries created by rbd_sysfs_init(), in reverse order. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3517
3518int __init rbd_init(void)
3519{
3520 int rc;
3521
3522 rc = rbd_sysfs_init();
3523 if (rc)
3524 return rc;
f0f8cef5 3525 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
3526 return 0;
3527}
3528
/* Module exit: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3533
/* Module entry points and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");