]> git.ipfire.org Git - people/arne_f/kernel.git/blame - drivers/block/rbd.c
rbd: don't zero-fill non-image object requests
[people/arne_f/kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
2647ba38 55/* It might be useful to have these defined elsewhere */
df111be6 56
2647ba38
AE
57#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
df111be6 61
f0f8cef5
AE
62#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
64
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
d4b125e9
AE
67#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
35d489f9 71#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
602adf40
YS
72
73#define RBD_SNAP_HEAD_NAME "-"
74
9e15b77d
AE
75/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
1e130199 77#define RBD_IMAGE_ID_LEN_MAX 64
9e15b77d 78
1e130199 79#define RBD_OBJ_PREFIX_LEN_MAX 64
589d30e0 80
d889140c
AE
81/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
81a89793
AE
89/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
602adf40 95#define DEV_NAME_LEN 32
81a89793 96#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40
YS
97
98/*
99 * block device image metadata (in-memory version)
100 */
101struct rbd_image_header {
f84344f3 102 /* These four fields never change for a given rbd image */
849b4260 103 char *object_prefix;
34b13184 104 u64 features;
602adf40
YS
105 __u8 obj_order;
106 __u8 crypt_type;
107 __u8 comp_type;
602adf40 108
f84344f3
AE
109 /* The remaining fields need to be updated occasionally */
110 u64 image_size;
111 struct ceph_snap_context *snapc;
602adf40
YS
112 char *snap_names;
113 u64 *snap_sizes;
59c2be1e
YS
114
115 u64 obj_version;
116};
117
0d7dbfce
AE
118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
c66c6e0c
AE
122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
0d7dbfce
AE
142 */
143struct rbd_spec {
144 u64 pool_id;
145 char *pool_name;
146
147 char *image_id;
0d7dbfce 148 char *image_name;
0d7dbfce
AE
149
150 u64 snap_id;
151 char *snap_name;
152
153 struct kref kref;
154};
155
602adf40 156/*
f0f8cef5 157 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
158 */
159struct rbd_client {
160 struct ceph_client *client;
161 struct kref kref;
162 struct list_head node;
163};
164
bf0d5f50
AE
165struct rbd_img_request;
166typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
167
168#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
169
170struct rbd_obj_request;
171typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
172
9969ebc5
AE
173enum obj_request_type {
174 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
175};
bf0d5f50
AE
176
177struct rbd_obj_request {
178 const char *object_name;
179 u64 offset; /* object start byte */
180 u64 length; /* bytes from offset */
181
182 struct rbd_img_request *img_request;
183 struct list_head links; /* img_request->obj_requests */
184 u32 which; /* posn image request list */
185
186 enum obj_request_type type;
788e2df3
AE
187 union {
188 struct bio *bio_list;
189 struct {
190 struct page **pages;
191 u32 page_count;
192 };
193 };
bf0d5f50
AE
194
195 struct ceph_osd_request *osd_req;
196
197 u64 xferred; /* bytes transferred */
198 u64 version;
1b83bef2 199 int result;
bf0d5f50
AE
200 atomic_t done;
201
202 rbd_obj_callback_t callback;
788e2df3 203 struct completion completion;
bf0d5f50
AE
204
205 struct kref kref;
206};
207
208struct rbd_img_request {
209 struct request *rq;
210 struct rbd_device *rbd_dev;
211 u64 offset; /* starting image byte offset */
212 u64 length; /* byte count from offset */
213 bool write_request; /* false for read */
214 union {
215 struct ceph_snap_context *snapc; /* for writes */
216 u64 snap_id; /* for reads */
217 };
218 spinlock_t completion_lock;/* protects next_completion */
219 u32 next_completion;
220 rbd_img_callback_t callback;
221
222 u32 obj_request_count;
223 struct list_head obj_requests; /* rbd_obj_request structs */
224
225 struct kref kref;
226};
227
228#define for_each_obj_request(ireq, oreq) \
ef06f4d3 229 list_for_each_entry(oreq, &(ireq)->obj_requests, links)
bf0d5f50 230#define for_each_obj_request_from(ireq, oreq) \
ef06f4d3 231 list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
bf0d5f50 232#define for_each_obj_request_safe(ireq, oreq, n) \
ef06f4d3 233 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
bf0d5f50 234
dfc5606d
YS
235struct rbd_snap {
236 struct device dev;
237 const char *name;
3591538f 238 u64 size;
dfc5606d
YS
239 struct list_head node;
240 u64 id;
34b13184 241 u64 features;
dfc5606d
YS
242};
243
f84344f3 244struct rbd_mapping {
99c1f08f 245 u64 size;
34b13184 246 u64 features;
f84344f3
AE
247 bool read_only;
248};
249
602adf40
YS
250/*
251 * a single device
252 */
253struct rbd_device {
de71a297 254 int dev_id; /* blkdev unique id */
602adf40
YS
255
256 int major; /* blkdev assigned major */
257 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 258
a30b71b9 259 u32 image_format; /* Either 1 or 2 */
602adf40
YS
260 struct rbd_client *rbd_client;
261
262 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
263
b82d167b 264 spinlock_t lock; /* queue, flags, open_count */
602adf40
YS
265
266 struct rbd_image_header header;
b82d167b 267 unsigned long flags; /* possibly lock protected */
0d7dbfce 268 struct rbd_spec *spec;
602adf40 269
0d7dbfce 270 char *header_name;
971f839a 271
0903e875
AE
272 struct ceph_file_layout layout;
273
59c2be1e 274 struct ceph_osd_event *watch_event;
975241af 275 struct rbd_obj_request *watch_request;
59c2be1e 276
86b00e0d
AE
277 struct rbd_spec *parent_spec;
278 u64 parent_overlap;
279
c666601a
JD
280 /* protects updating the header */
281 struct rw_semaphore header_rwsem;
f84344f3
AE
282
283 struct rbd_mapping mapping;
602adf40
YS
284
285 struct list_head node;
dfc5606d
YS
286
287 /* list of snapshots */
288 struct list_head snaps;
289
290 /* sysfs related */
291 struct device dev;
b82d167b 292 unsigned long open_count; /* protected by lock */
dfc5606d
YS
293};
294
b82d167b
AE
295/*
296 * Flag bits for rbd_dev->flags. If atomicity is required,
297 * rbd_dev->lock is used to protect access.
298 *
299 * Currently, only the "removing" flag (which is coupled with the
300 * "open_count" field) requires atomic access.
301 */
6d292906
AE
302enum rbd_dev_flags {
303 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
b82d167b 304 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
6d292906
AE
305};
306
602adf40 307static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 308
602adf40 309static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
310static DEFINE_SPINLOCK(rbd_dev_list_lock);
311
432b8587
AE
312static LIST_HEAD(rbd_client_list); /* clients */
313static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 314
304f6808
AE
315static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
316static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
317
dfc5606d 318static void rbd_dev_release(struct device *dev);
41f38c2b 319static void rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 320
f0f8cef5
AE
321static ssize_t rbd_add(struct bus_type *bus, const char *buf,
322 size_t count);
323static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
324 size_t count);
325
326static struct bus_attribute rbd_bus_attrs[] = {
327 __ATTR(add, S_IWUSR, NULL, rbd_add),
328 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
329 __ATTR_NULL
330};
331
332static struct bus_type rbd_bus_type = {
333 .name = "rbd",
334 .bus_attrs = rbd_bus_attrs,
335};
336
337static void rbd_root_dev_release(struct device *dev)
338{
339}
340
341static struct device rbd_root_dev = {
342 .init_name = "rbd",
343 .release = rbd_root_dev_release,
344};
345
06ecc6cb
AE
346static __printf(2, 3)
347void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
348{
349 struct va_format vaf;
350 va_list args;
351
352 va_start(args, fmt);
353 vaf.fmt = fmt;
354 vaf.va = &args;
355
356 if (!rbd_dev)
357 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
358 else if (rbd_dev->disk)
359 printk(KERN_WARNING "%s: %s: %pV\n",
360 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
361 else if (rbd_dev->spec && rbd_dev->spec->image_name)
362 printk(KERN_WARNING "%s: image %s: %pV\n",
363 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
364 else if (rbd_dev->spec && rbd_dev->spec->image_id)
365 printk(KERN_WARNING "%s: id %s: %pV\n",
366 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
367 else /* punt */
368 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
369 RBD_DRV_NAME, rbd_dev, &vaf);
370 va_end(args);
371}
372
aafb230e
AE
373#ifdef RBD_DEBUG
374#define rbd_assert(expr) \
375 if (unlikely(!(expr))) { \
376 printk(KERN_ERR "\nAssertion failure in %s() " \
377 "at line %d:\n\n" \
378 "\trbd_assert(%s);\n\n", \
379 __func__, __LINE__, #expr); \
380 BUG(); \
381 }
382#else /* !RBD_DEBUG */
383# define rbd_assert(expr) ((void) 0)
384#endif /* !RBD_DEBUG */
dfc5606d 385
117973fb
AE
386static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
387static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 388
602adf40
YS
389static int rbd_open(struct block_device *bdev, fmode_t mode)
390{
f0f8cef5 391 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
b82d167b 392 bool removing = false;
602adf40 393
f84344f3 394 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
395 return -EROFS;
396
a14ea269 397 spin_lock_irq(&rbd_dev->lock);
b82d167b
AE
398 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
399 removing = true;
400 else
401 rbd_dev->open_count++;
a14ea269 402 spin_unlock_irq(&rbd_dev->lock);
b82d167b
AE
403 if (removing)
404 return -ENOENT;
405
42382b70 406 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
c3e946ce 407 (void) get_device(&rbd_dev->dev);
f84344f3 408 set_device_ro(bdev, rbd_dev->mapping.read_only);
42382b70 409 mutex_unlock(&ctl_mutex);
340c7a2b 410
602adf40
YS
411 return 0;
412}
413
dfc5606d
YS
414static int rbd_release(struct gendisk *disk, fmode_t mode)
415{
416 struct rbd_device *rbd_dev = disk->private_data;
b82d167b
AE
417 unsigned long open_count_before;
418
a14ea269 419 spin_lock_irq(&rbd_dev->lock);
b82d167b 420 open_count_before = rbd_dev->open_count--;
a14ea269 421 spin_unlock_irq(&rbd_dev->lock);
b82d167b 422 rbd_assert(open_count_before > 0);
dfc5606d 423
42382b70 424 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
c3e946ce 425 put_device(&rbd_dev->dev);
42382b70 426 mutex_unlock(&ctl_mutex);
dfc5606d
YS
427
428 return 0;
429}
430
602adf40
YS
431static const struct block_device_operations rbd_bd_ops = {
432 .owner = THIS_MODULE,
433 .open = rbd_open,
dfc5606d 434 .release = rbd_release,
602adf40
YS
435};
436
437/*
438 * Initialize an rbd client instance.
43ae4701 439 * We own *ceph_opts.
602adf40 440 */
f8c38929 441static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
442{
443 struct rbd_client *rbdc;
444 int ret = -ENOMEM;
445
37206ee5 446 dout("%s:\n", __func__);
602adf40
YS
447 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
448 if (!rbdc)
449 goto out_opt;
450
451 kref_init(&rbdc->kref);
452 INIT_LIST_HEAD(&rbdc->node);
453
bc534d86
AE
454 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
455
43ae4701 456 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 457 if (IS_ERR(rbdc->client))
bc534d86 458 goto out_mutex;
43ae4701 459 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
460
461 ret = ceph_open_session(rbdc->client);
462 if (ret < 0)
463 goto out_err;
464
432b8587 465 spin_lock(&rbd_client_list_lock);
602adf40 466 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 467 spin_unlock(&rbd_client_list_lock);
602adf40 468
bc534d86 469 mutex_unlock(&ctl_mutex);
37206ee5 470 dout("%s: rbdc %p\n", __func__, rbdc);
bc534d86 471
602adf40
YS
472 return rbdc;
473
474out_err:
475 ceph_destroy_client(rbdc->client);
bc534d86
AE
476out_mutex:
477 mutex_unlock(&ctl_mutex);
602adf40
YS
478 kfree(rbdc);
479out_opt:
43ae4701
AE
480 if (ceph_opts)
481 ceph_destroy_options(ceph_opts);
37206ee5
AE
482 dout("%s: error %d\n", __func__, ret);
483
28f259b7 484 return ERR_PTR(ret);
602adf40
YS
485}
486
487/*
1f7ba331
AE
488 * Find a ceph client with specific addr and configuration. If
489 * found, bump its reference count.
602adf40 490 */
1f7ba331 491static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
492{
493 struct rbd_client *client_node;
1f7ba331 494 bool found = false;
602adf40 495
43ae4701 496 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
497 return NULL;
498
1f7ba331
AE
499 spin_lock(&rbd_client_list_lock);
500 list_for_each_entry(client_node, &rbd_client_list, node) {
501 if (!ceph_compare_options(ceph_opts, client_node->client)) {
502 kref_get(&client_node->kref);
503 found = true;
504 break;
505 }
506 }
507 spin_unlock(&rbd_client_list_lock);
508
509 return found ? client_node : NULL;
602adf40
YS
510}
511
59c2be1e
YS
512/*
513 * mount options
514 */
515enum {
59c2be1e
YS
516 Opt_last_int,
517 /* int args above */
518 Opt_last_string,
519 /* string args above */
cc0538b6
AE
520 Opt_read_only,
521 Opt_read_write,
522 /* Boolean args above */
523 Opt_last_bool,
59c2be1e
YS
524};
525
43ae4701 526static match_table_t rbd_opts_tokens = {
59c2be1e
YS
527 /* int args above */
528 /* string args above */
be466c1c 529 {Opt_read_only, "read_only"},
cc0538b6
AE
530 {Opt_read_only, "ro"}, /* Alternate spelling */
531 {Opt_read_write, "read_write"},
532 {Opt_read_write, "rw"}, /* Alternate spelling */
533 /* Boolean args above */
59c2be1e
YS
534 {-1, NULL}
535};
536
98571b5a
AE
537struct rbd_options {
538 bool read_only;
539};
540
541#define RBD_READ_ONLY_DEFAULT false
542
59c2be1e
YS
543static int parse_rbd_opts_token(char *c, void *private)
544{
43ae4701 545 struct rbd_options *rbd_opts = private;
59c2be1e
YS
546 substring_t argstr[MAX_OPT_ARGS];
547 int token, intval, ret;
548
43ae4701 549 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
550 if (token < 0)
551 return -EINVAL;
552
553 if (token < Opt_last_int) {
554 ret = match_int(&argstr[0], &intval);
555 if (ret < 0) {
556 pr_err("bad mount option arg (not int) "
557 "at '%s'\n", c);
558 return ret;
559 }
560 dout("got int token %d val %d\n", token, intval);
561 } else if (token > Opt_last_int && token < Opt_last_string) {
562 dout("got string token %d val %s\n", token,
563 argstr[0].from);
cc0538b6
AE
564 } else if (token > Opt_last_string && token < Opt_last_bool) {
565 dout("got Boolean token %d\n", token);
59c2be1e
YS
566 } else {
567 dout("got token %d\n", token);
568 }
569
570 switch (token) {
cc0538b6
AE
571 case Opt_read_only:
572 rbd_opts->read_only = true;
573 break;
574 case Opt_read_write:
575 rbd_opts->read_only = false;
576 break;
59c2be1e 577 default:
aafb230e
AE
578 rbd_assert(false);
579 break;
59c2be1e
YS
580 }
581 return 0;
582}
583
602adf40
YS
584/*
585 * Get a ceph client with specific addr and configuration, if one does
586 * not exist create it.
587 */
9d3997fd 588static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
602adf40 589{
f8c38929 590 struct rbd_client *rbdc;
59c2be1e 591
1f7ba331 592 rbdc = rbd_client_find(ceph_opts);
9d3997fd 593 if (rbdc) /* using an existing client */
43ae4701 594 ceph_destroy_options(ceph_opts);
9d3997fd 595 else
f8c38929 596 rbdc = rbd_client_create(ceph_opts);
602adf40 597
9d3997fd 598 return rbdc;
602adf40
YS
599}
600
601/*
602 * Destroy ceph client
d23a4b3f 603 *
432b8587 604 * Caller must hold rbd_client_list_lock.
602adf40
YS
605 */
606static void rbd_client_release(struct kref *kref)
607{
608 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
609
37206ee5 610 dout("%s: rbdc %p\n", __func__, rbdc);
cd9d9f5d 611 spin_lock(&rbd_client_list_lock);
602adf40 612 list_del(&rbdc->node);
cd9d9f5d 613 spin_unlock(&rbd_client_list_lock);
602adf40
YS
614
615 ceph_destroy_client(rbdc->client);
616 kfree(rbdc);
617}
618
619/*
620 * Drop reference to ceph client node. If it's not referenced anymore, release
621 * it.
622 */
9d3997fd 623static void rbd_put_client(struct rbd_client *rbdc)
602adf40 624{
c53d5893
AE
625 if (rbdc)
626 kref_put(&rbdc->kref, rbd_client_release);
602adf40
YS
627}
628
a30b71b9
AE
629static bool rbd_image_format_valid(u32 image_format)
630{
631 return image_format == 1 || image_format == 2;
632}
633
8e94af8e
AE
634static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
635{
103a150f
AE
636 size_t size;
637 u32 snap_count;
638
639 /* The header has to start with the magic rbd header text */
640 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
641 return false;
642
db2388b6
AE
643 /* The bio layer requires at least sector-sized I/O */
644
645 if (ondisk->options.order < SECTOR_SHIFT)
646 return false;
647
648 /* If we use u64 in a few spots we may be able to loosen this */
649
650 if (ondisk->options.order > 8 * sizeof (int) - 1)
651 return false;
652
103a150f
AE
653 /*
654 * The size of a snapshot header has to fit in a size_t, and
655 * that limits the number of snapshots.
656 */
657 snap_count = le32_to_cpu(ondisk->snap_count);
658 size = SIZE_MAX - sizeof (struct ceph_snap_context);
659 if (snap_count > size / sizeof (__le64))
660 return false;
661
662 /*
663 * Not only that, but the size of the entire the snapshot
664 * header must also be representable in a size_t.
665 */
666 size -= snap_count * sizeof (__le64);
667 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
668 return false;
669
670 return true;
8e94af8e
AE
671}
672
602adf40
YS
673/*
674 * Create a new header structure, translate header format from the on-disk
675 * header.
676 */
677static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 678 struct rbd_image_header_ondisk *ondisk)
602adf40 679{
ccece235 680 u32 snap_count;
58c17b0e 681 size_t len;
d2bb24e5 682 size_t size;
621901d6 683 u32 i;
602adf40 684
6a52325f
AE
685 memset(header, 0, sizeof (*header));
686
103a150f
AE
687 snap_count = le32_to_cpu(ondisk->snap_count);
688
58c17b0e
AE
689 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
690 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 691 if (!header->object_prefix)
602adf40 692 return -ENOMEM;
58c17b0e
AE
693 memcpy(header->object_prefix, ondisk->object_prefix, len);
694 header->object_prefix[len] = '\0';
00f1f36f 695
602adf40 696 if (snap_count) {
f785cc1d
AE
697 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
698
621901d6
AE
699 /* Save a copy of the snapshot names */
700
f785cc1d
AE
701 if (snap_names_len > (u64) SIZE_MAX)
702 return -EIO;
703 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 704 if (!header->snap_names)
6a52325f 705 goto out_err;
f785cc1d
AE
706 /*
707 * Note that rbd_dev_v1_header_read() guarantees
708 * the ondisk buffer we're working with has
709 * snap_names_len bytes beyond the end of the
710 * snapshot id array, this memcpy() is safe.
711 */
712 memcpy(header->snap_names, &ondisk->snaps[snap_count],
713 snap_names_len);
6a52325f 714
621901d6
AE
715 /* Record each snapshot's size */
716
d2bb24e5
AE
717 size = snap_count * sizeof (*header->snap_sizes);
718 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 719 if (!header->snap_sizes)
6a52325f 720 goto out_err;
621901d6
AE
721 for (i = 0; i < snap_count; i++)
722 header->snap_sizes[i] =
723 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 724 } else {
ccece235 725 WARN_ON(ondisk->snap_names_len);
602adf40
YS
726 header->snap_names = NULL;
727 header->snap_sizes = NULL;
728 }
849b4260 729
34b13184 730 header->features = 0; /* No features support in v1 images */
602adf40
YS
731 header->obj_order = ondisk->options.order;
732 header->crypt_type = ondisk->options.crypt_type;
733 header->comp_type = ondisk->options.comp_type;
6a52325f 734
621901d6
AE
735 /* Allocate and fill in the snapshot context */
736
f84344f3 737 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
738 size = sizeof (struct ceph_snap_context);
739 size += snap_count * sizeof (header->snapc->snaps[0]);
740 header->snapc = kzalloc(size, GFP_KERNEL);
741 if (!header->snapc)
742 goto out_err;
602adf40
YS
743
744 atomic_set(&header->snapc->nref, 1);
505cbb9b 745 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 746 header->snapc->num_snaps = snap_count;
621901d6
AE
747 for (i = 0; i < snap_count; i++)
748 header->snapc->snaps[i] =
749 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
750
751 return 0;
752
6a52325f 753out_err:
849b4260 754 kfree(header->snap_sizes);
ccece235 755 header->snap_sizes = NULL;
602adf40 756 kfree(header->snap_names);
ccece235 757 header->snap_names = NULL;
6a52325f
AE
758 kfree(header->object_prefix);
759 header->object_prefix = NULL;
ccece235 760
00f1f36f 761 return -ENOMEM;
602adf40
YS
762}
763
9e15b77d
AE
764static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
765{
766 struct rbd_snap *snap;
767
768 if (snap_id == CEPH_NOSNAP)
769 return RBD_SNAP_HEAD_NAME;
770
771 list_for_each_entry(snap, &rbd_dev->snaps, node)
772 if (snap_id == snap->id)
773 return snap->name;
774
775 return NULL;
776}
777
8836b995 778static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 779{
602adf40 780
e86924a8 781 struct rbd_snap *snap;
602adf40 782
e86924a8
AE
783 list_for_each_entry(snap, &rbd_dev->snaps, node) {
784 if (!strcmp(snap_name, snap->name)) {
0d7dbfce 785 rbd_dev->spec->snap_id = snap->id;
e86924a8 786 rbd_dev->mapping.size = snap->size;
34b13184 787 rbd_dev->mapping.features = snap->features;
602adf40 788
e86924a8 789 return 0;
00f1f36f 790 }
00f1f36f 791 }
e86924a8 792
00f1f36f 793 return -ENOENT;
602adf40
YS
794}
795
819d52bf 796static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
602adf40 797{
78dc447d 798 int ret;
602adf40 799
0d7dbfce 800 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 801 sizeof (RBD_SNAP_HEAD_NAME))) {
0d7dbfce 802 rbd_dev->spec->snap_id = CEPH_NOSNAP;
99c1f08f 803 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 804 rbd_dev->mapping.features = rbd_dev->header.features;
e86924a8 805 ret = 0;
602adf40 806 } else {
0d7dbfce 807 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
602adf40
YS
808 if (ret < 0)
809 goto done;
f84344f3 810 rbd_dev->mapping.read_only = true;
602adf40 811 }
6d292906
AE
812 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
813
602adf40 814done:
602adf40
YS
815 return ret;
816}
817
818static void rbd_header_free(struct rbd_image_header *header)
819{
849b4260 820 kfree(header->object_prefix);
d78fd7ae 821 header->object_prefix = NULL;
602adf40 822 kfree(header->snap_sizes);
d78fd7ae 823 header->snap_sizes = NULL;
849b4260 824 kfree(header->snap_names);
d78fd7ae 825 header->snap_names = NULL;
d1d25646 826 ceph_put_snap_context(header->snapc);
d78fd7ae 827 header->snapc = NULL;
602adf40
YS
828}
829
98571b5a 830static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 831{
65ccfe21
AE
832 char *name;
833 u64 segment;
834 int ret;
602adf40 835
2fd82b9e 836 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
65ccfe21
AE
837 if (!name)
838 return NULL;
839 segment = offset >> rbd_dev->header.obj_order;
2fd82b9e 840 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
65ccfe21 841 rbd_dev->header.object_prefix, segment);
2fd82b9e 842 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
65ccfe21
AE
843 pr_err("error formatting segment name for #%llu (%d)\n",
844 segment, ret);
845 kfree(name);
846 name = NULL;
847 }
602adf40 848
65ccfe21
AE
849 return name;
850}
602adf40 851
65ccfe21
AE
852static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
853{
854 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 855
65ccfe21
AE
856 return offset & (segment_size - 1);
857}
858
859static u64 rbd_segment_length(struct rbd_device *rbd_dev,
860 u64 offset, u64 length)
861{
862 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
863
864 offset &= segment_size - 1;
865
aafb230e 866 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
867 if (offset + length > segment_size)
868 length = segment_size - offset;
869
870 return length;
602adf40
YS
871}
872
029bcbd8
JD
873/*
874 * returns the size of an object in the image
875 */
876static u64 rbd_obj_bytes(struct rbd_image_header *header)
877{
878 return 1 << header->obj_order;
879}
880
602adf40
YS
881/*
882 * bio helpers
883 */
884
885static void bio_chain_put(struct bio *chain)
886{
887 struct bio *tmp;
888
889 while (chain) {
890 tmp = chain;
891 chain = chain->bi_next;
892 bio_put(tmp);
893 }
894}
895
896/*
897 * zeros a bio chain, starting at specific offset
898 */
899static void zero_bio_chain(struct bio *chain, int start_ofs)
900{
901 struct bio_vec *bv;
902 unsigned long flags;
903 void *buf;
904 int i;
905 int pos = 0;
906
907 while (chain) {
908 bio_for_each_segment(bv, chain, i) {
909 if (pos + bv->bv_len > start_ofs) {
910 int remainder = max(start_ofs - pos, 0);
911 buf = bvec_kmap_irq(bv, &flags);
912 memset(buf + remainder, 0,
913 bv->bv_len - remainder);
85b5aaa6 914 bvec_kunmap_irq(buf, &flags);
602adf40
YS
915 }
916 pos += bv->bv_len;
917 }
918
919 chain = chain->bi_next;
920 }
921}
922
923/*
f7760dad
AE
924 * Clone a portion of a bio, starting at the given byte offset
925 * and continuing for the number of bytes indicated.
602adf40 926 */
f7760dad
AE
927static struct bio *bio_clone_range(struct bio *bio_src,
928 unsigned int offset,
929 unsigned int len,
930 gfp_t gfpmask)
602adf40 931{
f7760dad
AE
932 struct bio_vec *bv;
933 unsigned int resid;
934 unsigned short idx;
935 unsigned int voff;
936 unsigned short end_idx;
937 unsigned short vcnt;
938 struct bio *bio;
939
940 /* Handle the easy case for the caller */
941
942 if (!offset && len == bio_src->bi_size)
943 return bio_clone(bio_src, gfpmask);
944
945 if (WARN_ON_ONCE(!len))
946 return NULL;
947 if (WARN_ON_ONCE(len > bio_src->bi_size))
948 return NULL;
949 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
950 return NULL;
951
952 /* Find first affected segment... */
953
954 resid = offset;
955 __bio_for_each_segment(bv, bio_src, idx, 0) {
956 if (resid < bv->bv_len)
957 break;
958 resid -= bv->bv_len;
602adf40 959 }
f7760dad 960 voff = resid;
602adf40 961
f7760dad 962 /* ...and the last affected segment */
602adf40 963
f7760dad
AE
964 resid += len;
965 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
966 if (resid <= bv->bv_len)
967 break;
968 resid -= bv->bv_len;
969 }
970 vcnt = end_idx - idx + 1;
971
972 /* Build the clone */
973
974 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
975 if (!bio)
976 return NULL; /* ENOMEM */
602adf40 977
f7760dad
AE
978 bio->bi_bdev = bio_src->bi_bdev;
979 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
980 bio->bi_rw = bio_src->bi_rw;
981 bio->bi_flags |= 1 << BIO_CLONED;
982
983 /*
984 * Copy over our part of the bio_vec, then update the first
985 * and last (or only) entries.
986 */
987 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
988 vcnt * sizeof (struct bio_vec));
989 bio->bi_io_vec[0].bv_offset += voff;
990 if (vcnt > 1) {
991 bio->bi_io_vec[0].bv_len -= voff;
992 bio->bi_io_vec[vcnt - 1].bv_len = resid;
993 } else {
994 bio->bi_io_vec[0].bv_len = len;
602adf40
YS
995 }
996
f7760dad
AE
997 bio->bi_vcnt = vcnt;
998 bio->bi_size = len;
999 bio->bi_idx = 0;
1000
1001 return bio;
1002}
1003
1004/*
1005 * Clone a portion of a bio chain, starting at the given byte offset
1006 * into the first bio in the source chain and continuing for the
1007 * number of bytes indicated. The result is another bio chain of
1008 * exactly the given length, or a null pointer on error.
1009 *
1010 * The bio_src and offset parameters are both in-out. On entry they
1011 * refer to the first source bio and the offset into that bio where
1012 * the start of data to be cloned is located.
1013 *
1014 * On return, bio_src is updated to refer to the bio in the source
1015 * chain that contains first un-cloned byte, and *offset will
1016 * contain the offset of that byte within that bio.
1017 */
1018static struct bio *bio_chain_clone_range(struct bio **bio_src,
1019 unsigned int *offset,
1020 unsigned int len,
1021 gfp_t gfpmask)
1022{
1023 struct bio *bi = *bio_src;
1024 unsigned int off = *offset;
1025 struct bio *chain = NULL;
1026 struct bio **end;
1027
1028 /* Build up a chain of clone bios up to the limit */
1029
1030 if (!bi || off >= bi->bi_size || !len)
1031 return NULL; /* Nothing to clone */
602adf40 1032
f7760dad
AE
1033 end = &chain;
1034 while (len) {
1035 unsigned int bi_size;
1036 struct bio *bio;
1037
f5400b7a
AE
1038 if (!bi) {
1039 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
f7760dad 1040 goto out_err; /* EINVAL; ran out of bio's */
f5400b7a 1041 }
f7760dad
AE
1042 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1043 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1044 if (!bio)
1045 goto out_err; /* ENOMEM */
1046
1047 *end = bio;
1048 end = &bio->bi_next;
602adf40 1049
f7760dad
AE
1050 off += bi_size;
1051 if (off == bi->bi_size) {
1052 bi = bi->bi_next;
1053 off = 0;
1054 }
1055 len -= bi_size;
1056 }
1057 *bio_src = bi;
1058 *offset = off;
1059
1060 return chain;
1061out_err:
1062 bio_chain_put(chain);
602adf40 1063
602adf40
YS
1064 return NULL;
1065}
1066
bf0d5f50
AE
1067static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1068{
37206ee5
AE
1069 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1070 atomic_read(&obj_request->kref.refcount));
bf0d5f50
AE
1071 kref_get(&obj_request->kref);
1072}
1073
1074static void rbd_obj_request_destroy(struct kref *kref);
1075static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1076{
1077 rbd_assert(obj_request != NULL);
37206ee5
AE
1078 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1079 atomic_read(&obj_request->kref.refcount));
bf0d5f50
AE
1080 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1081}
1082
1083static void rbd_img_request_get(struct rbd_img_request *img_request)
1084{
37206ee5
AE
1085 dout("%s: img %p (was %d)\n", __func__, img_request,
1086 atomic_read(&img_request->kref.refcount));
bf0d5f50
AE
1087 kref_get(&img_request->kref);
1088}
1089
1090static void rbd_img_request_destroy(struct kref *kref);
1091static void rbd_img_request_put(struct rbd_img_request *img_request)
1092{
1093 rbd_assert(img_request != NULL);
37206ee5
AE
1094 dout("%s: img %p (was %d)\n", __func__, img_request,
1095 atomic_read(&img_request->kref.refcount));
bf0d5f50
AE
1096 kref_put(&img_request->kref, rbd_img_request_destroy);
1097}
1098
1099static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1100 struct rbd_obj_request *obj_request)
1101{
25dcf954
AE
1102 rbd_assert(obj_request->img_request == NULL);
1103
bf0d5f50
AE
1104 rbd_obj_request_get(obj_request);
1105 obj_request->img_request = img_request;
25dcf954 1106 obj_request->which = img_request->obj_request_count;
bf0d5f50 1107 rbd_assert(obj_request->which != BAD_WHICH);
25dcf954
AE
1108 img_request->obj_request_count++;
1109 list_add_tail(&obj_request->links, &img_request->obj_requests);
37206ee5
AE
1110 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1111 obj_request->which);
bf0d5f50
AE
1112}
1113
1114static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1115 struct rbd_obj_request *obj_request)
1116{
1117 rbd_assert(obj_request->which != BAD_WHICH);
25dcf954 1118
37206ee5
AE
1119 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1120 obj_request->which);
bf0d5f50 1121 list_del(&obj_request->links);
25dcf954
AE
1122 rbd_assert(img_request->obj_request_count > 0);
1123 img_request->obj_request_count--;
1124 rbd_assert(obj_request->which == img_request->obj_request_count);
1125 obj_request->which = BAD_WHICH;
bf0d5f50 1126 rbd_assert(obj_request->img_request == img_request);
bf0d5f50 1127 obj_request->img_request = NULL;
25dcf954 1128 obj_request->callback = NULL;
bf0d5f50
AE
1129 rbd_obj_request_put(obj_request);
1130}
1131
1132static bool obj_request_type_valid(enum obj_request_type type)
1133{
1134 switch (type) {
9969ebc5 1135 case OBJ_REQUEST_NODATA:
bf0d5f50 1136 case OBJ_REQUEST_BIO:
788e2df3 1137 case OBJ_REQUEST_PAGES:
bf0d5f50
AE
1138 return true;
1139 default:
1140 return false;
1141 }
1142}
1143
cc344fa1 1144static struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
8d23bf29
AE
1145{
1146 struct ceph_osd_req_op *op;
1147 va_list args;
2647ba38 1148 size_t size;
8d23bf29
AE
1149
1150 op = kzalloc(sizeof (*op), GFP_NOIO);
1151 if (!op)
1152 return NULL;
1153 op->op = opcode;
1154 va_start(args, opcode);
1155 switch (opcode) {
1156 case CEPH_OSD_OP_READ:
1157 case CEPH_OSD_OP_WRITE:
1158 /* rbd_osd_req_op_create(READ, offset, length) */
1159 /* rbd_osd_req_op_create(WRITE, offset, length) */
1160 op->extent.offset = va_arg(args, u64);
1161 op->extent.length = va_arg(args, u64);
1162 if (opcode == CEPH_OSD_OP_WRITE)
1163 op->payload_len = op->extent.length;
1164 break;
fbfab539
AE
1165 case CEPH_OSD_OP_STAT:
1166 break;
2647ba38
AE
1167 case CEPH_OSD_OP_CALL:
1168 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1169 op->cls.class_name = va_arg(args, char *);
1170 size = strlen(op->cls.class_name);
1171 rbd_assert(size <= (size_t) U8_MAX);
1172 op->cls.class_len = size;
1173 op->payload_len = size;
1174
1175 op->cls.method_name = va_arg(args, char *);
1176 size = strlen(op->cls.method_name);
1177 rbd_assert(size <= (size_t) U8_MAX);
1178 op->cls.method_len = size;
1179 op->payload_len += size;
1180
1181 op->cls.argc = 0;
1182 op->cls.indata = va_arg(args, void *);
1183 size = va_arg(args, size_t);
1184 rbd_assert(size <= (size_t) U32_MAX);
1185 op->cls.indata_len = (u32) size;
1186 op->payload_len += size;
1187 break;
5efea49a
AE
1188 case CEPH_OSD_OP_NOTIFY_ACK:
1189 case CEPH_OSD_OP_WATCH:
1190 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1191 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1192 op->watch.cookie = va_arg(args, u64);
1193 op->watch.ver = va_arg(args, u64);
1194 op->watch.ver = cpu_to_le64(op->watch.ver);
1195 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1196 op->watch.flag = (u8) 1;
1197 break;
8d23bf29
AE
1198 default:
1199 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1200 kfree(op);
1201 op = NULL;
1202 break;
1203 }
1204 va_end(args);
1205
1206 return op;
1207}
1208
1209static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
1210{
1211 kfree(op);
1212}
1213
bf0d5f50
AE
1214static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1215 struct rbd_obj_request *obj_request)
1216{
37206ee5
AE
1217 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1218
bf0d5f50
AE
1219 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1220}
1221
1222static void rbd_img_request_complete(struct rbd_img_request *img_request)
1223{
37206ee5 1224 dout("%s: img %p\n", __func__, img_request);
bf0d5f50
AE
1225 if (img_request->callback)
1226 img_request->callback(img_request);
1227 else
1228 rbd_img_request_put(img_request);
1229}
1230
788e2df3
AE
1231/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1232
1233static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1234{
37206ee5
AE
1235 dout("%s: obj %p\n", __func__, obj_request);
1236
788e2df3
AE
1237 return wait_for_completion_interruptible(&obj_request->completion);
1238}
1239
07741308
AE
1240static void obj_request_done_init(struct rbd_obj_request *obj_request)
1241{
1242 atomic_set(&obj_request->done, 0);
1243 smp_wmb();
1244}
1245
1246static void obj_request_done_set(struct rbd_obj_request *obj_request)
1247{
632b88ca
AE
1248 int done;
1249
1250 done = atomic_inc_return(&obj_request->done);
1251 if (done > 1) {
1252 struct rbd_img_request *img_request = obj_request->img_request;
1253 struct rbd_device *rbd_dev;
1254
1255 rbd_dev = img_request ? img_request->rbd_dev : NULL;
1256 rbd_warn(rbd_dev, "obj_request %p was already done\n",
1257 obj_request);
1258 }
07741308
AE
1259}
1260
1261static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1262{
632b88ca 1263 smp_mb();
07741308
AE
1264 return atomic_read(&obj_request->done) != 0;
1265}
1266
6e2a4505
AE
1267static void
1268rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1269{
1270 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1271 obj_request, obj_request->img_request, obj_request->result,
1272 obj_request->xferred, obj_request->length);
1273 /*
1274 * ENOENT means a hole in the image. We zero-fill the
1275 * entire length of the request. A short read also implies
1276 * zero-fill to the end of the request. Either way we
1277 * update the xferred count to indicate the whole request
1278 * was satisfied.
1279 */
1280 BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
1281 if (obj_request->result == -ENOENT) {
1282 zero_bio_chain(obj_request->bio_list, 0);
1283 obj_request->result = 0;
1284 obj_request->xferred = obj_request->length;
1285 } else if (obj_request->xferred < obj_request->length &&
1286 !obj_request->result) {
1287 zero_bio_chain(obj_request->bio_list, obj_request->xferred);
1288 obj_request->xferred = obj_request->length;
1289 }
1290 obj_request_done_set(obj_request);
1291}
1292
bf0d5f50
AE
1293static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1294{
37206ee5
AE
1295 dout("%s: obj %p cb %p\n", __func__, obj_request,
1296 obj_request->callback);
bf0d5f50
AE
1297 if (obj_request->callback)
1298 obj_request->callback(obj_request);
788e2df3
AE
1299 else
1300 complete_all(&obj_request->completion);
bf0d5f50
AE
1301}
1302
c47f9371 1303static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
39bf2c5d
AE
1304{
1305 dout("%s: obj %p\n", __func__, obj_request);
1306 obj_request_done_set(obj_request);
1307}
1308
c47f9371 1309static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
bf0d5f50 1310{
37206ee5 1311 dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
c47f9371 1312 obj_request->result, obj_request->xferred, obj_request->length);
6e2a4505
AE
1313 if (obj_request->img_request)
1314 rbd_img_obj_request_read_callback(obj_request);
1315 else
1316 obj_request_done_set(obj_request);
bf0d5f50
AE
1317}
1318
c47f9371 1319static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
bf0d5f50 1320{
1b83bef2
SW
1321 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1322 obj_request->result, obj_request->length);
1323 /*
1324 * There is no such thing as a successful short write.
1325 * Our xferred value is the number of bytes transferred
1326 * back. Set it to our originally-requested length.
1327 */
1328 obj_request->xferred = obj_request->length;
07741308 1329 obj_request_done_set(obj_request);
bf0d5f50
AE
1330}
1331
fbfab539
AE
1332/*
1333 * For a simple stat call there's nothing to do. We'll do more if
1334 * this is part of a write sequence for a layered image.
1335 */
c47f9371 1336static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
fbfab539 1337{
37206ee5 1338 dout("%s: obj %p\n", __func__, obj_request);
fbfab539
AE
1339 obj_request_done_set(obj_request);
1340}
1341
bf0d5f50
AE
1342static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1343 struct ceph_msg *msg)
1344{
1345 struct rbd_obj_request *obj_request = osd_req->r_priv;
bf0d5f50
AE
1346 u16 opcode;
1347
37206ee5 1348 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
bf0d5f50
AE
1349 rbd_assert(osd_req == obj_request->osd_req);
1350 rbd_assert(!!obj_request->img_request ^
1351 (obj_request->which == BAD_WHICH));
1352
1b83bef2
SW
1353 if (osd_req->r_result < 0)
1354 obj_request->result = osd_req->r_result;
bf0d5f50
AE
1355 obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1356
1b83bef2 1357 WARN_ON(osd_req->r_num_ops != 1); /* For now */
bf0d5f50 1358
c47f9371
AE
1359 /*
1360 * We support a 64-bit length, but ultimately it has to be
1361 * passed to blk_end_request(), which takes an unsigned int.
1362 */
1b83bef2 1363 obj_request->xferred = osd_req->r_reply_op_len[0];
c47f9371 1364 rbd_assert(obj_request->xferred < (u64) UINT_MAX);
1b83bef2 1365 opcode = osd_req->r_request_ops[0].op;
bf0d5f50
AE
1366 switch (opcode) {
1367 case CEPH_OSD_OP_READ:
c47f9371 1368 rbd_osd_read_callback(obj_request);
bf0d5f50
AE
1369 break;
1370 case CEPH_OSD_OP_WRITE:
c47f9371 1371 rbd_osd_write_callback(obj_request);
bf0d5f50 1372 break;
fbfab539 1373 case CEPH_OSD_OP_STAT:
c47f9371 1374 rbd_osd_stat_callback(obj_request);
fbfab539 1375 break;
36be9a76 1376 case CEPH_OSD_OP_CALL:
b8d70035 1377 case CEPH_OSD_OP_NOTIFY_ACK:
9969ebc5 1378 case CEPH_OSD_OP_WATCH:
c47f9371 1379 rbd_osd_trivial_callback(obj_request);
9969ebc5 1380 break;
bf0d5f50
AE
1381 default:
1382 rbd_warn(NULL, "%s: unsupported op %hu\n",
1383 obj_request->object_name, (unsigned short) opcode);
1384 break;
1385 }
1386
07741308 1387 if (obj_request_done_test(obj_request))
bf0d5f50
AE
1388 rbd_obj_request_complete(obj_request);
1389}
1390
1391static struct ceph_osd_request *rbd_osd_req_create(
1392 struct rbd_device *rbd_dev,
1393 bool write_request,
1394 struct rbd_obj_request *obj_request,
1395 struct ceph_osd_req_op *op)
1396{
1397 struct rbd_img_request *img_request = obj_request->img_request;
1398 struct ceph_snap_context *snapc = NULL;
1399 struct ceph_osd_client *osdc;
1400 struct ceph_osd_request *osd_req;
1401 struct timespec now;
1402 struct timespec *mtime;
1403 u64 snap_id = CEPH_NOSNAP;
1404 u64 offset = obj_request->offset;
1405 u64 length = obj_request->length;
1406
1407 if (img_request) {
1408 rbd_assert(img_request->write_request == write_request);
1409 if (img_request->write_request)
1410 snapc = img_request->snapc;
1411 else
1412 snap_id = img_request->snap_id;
1413 }
1414
1415 /* Allocate and initialize the request, for the single op */
1416
1417 osdc = &rbd_dev->rbd_client->client->osdc;
1418 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1419 if (!osd_req)
1420 return NULL; /* ENOMEM */
1421
1422 rbd_assert(obj_request_type_valid(obj_request->type));
1423 switch (obj_request->type) {
9969ebc5
AE
1424 case OBJ_REQUEST_NODATA:
1425 break; /* Nothing to do */
bf0d5f50
AE
1426 case OBJ_REQUEST_BIO:
1427 rbd_assert(obj_request->bio_list != NULL);
1428 osd_req->r_bio = obj_request->bio_list;
bf0d5f50 1429 break;
788e2df3
AE
1430 case OBJ_REQUEST_PAGES:
1431 osd_req->r_pages = obj_request->pages;
1432 osd_req->r_num_pages = obj_request->page_count;
1433 osd_req->r_page_alignment = offset & ~PAGE_MASK;
1434 break;
bf0d5f50
AE
1435 }
1436
1437 if (write_request) {
1438 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1439 now = CURRENT_TIME;
1440 mtime = &now;
1441 } else {
1442 osd_req->r_flags = CEPH_OSD_FLAG_READ;
1443 mtime = NULL; /* not needed for reads */
1444 offset = 0; /* These are not used... */
1445 length = 0; /* ...for osd read requests */
1446 }
1447
1448 osd_req->r_callback = rbd_osd_req_callback;
1449 osd_req->r_priv = obj_request;
1450
1451 osd_req->r_oid_len = strlen(obj_request->object_name);
1452 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1453 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1454
1455 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1456
1457 /* osd_req will get its own reference to snapc (if non-null) */
1458
1459 ceph_osdc_build_request(osd_req, offset, length, 1, op,
1460 snapc, snap_id, mtime);
1461
1462 return osd_req;
1463}
1464
1465static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1466{
1467 ceph_osdc_put_request(osd_req);
1468}
1469
1470/* object_name is assumed to be a non-null pointer and NUL-terminated */
1471
1472static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1473 u64 offset, u64 length,
1474 enum obj_request_type type)
1475{
1476 struct rbd_obj_request *obj_request;
1477 size_t size;
1478 char *name;
1479
1480 rbd_assert(obj_request_type_valid(type));
1481
1482 size = strlen(object_name) + 1;
1483 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1484 if (!obj_request)
1485 return NULL;
1486
1487 name = (char *)(obj_request + 1);
1488 obj_request->object_name = memcpy(name, object_name, size);
1489 obj_request->offset = offset;
1490 obj_request->length = length;
1491 obj_request->which = BAD_WHICH;
1492 obj_request->type = type;
1493 INIT_LIST_HEAD(&obj_request->links);
07741308 1494 obj_request_done_init(obj_request);
788e2df3 1495 init_completion(&obj_request->completion);
bf0d5f50
AE
1496 kref_init(&obj_request->kref);
1497
37206ee5
AE
1498 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1499 offset, length, (int)type, obj_request);
1500
bf0d5f50
AE
1501 return obj_request;
1502}
1503
1504static void rbd_obj_request_destroy(struct kref *kref)
1505{
1506 struct rbd_obj_request *obj_request;
1507
1508 obj_request = container_of(kref, struct rbd_obj_request, kref);
1509
37206ee5
AE
1510 dout("%s: obj %p\n", __func__, obj_request);
1511
bf0d5f50
AE
1512 rbd_assert(obj_request->img_request == NULL);
1513 rbd_assert(obj_request->which == BAD_WHICH);
1514
1515 if (obj_request->osd_req)
1516 rbd_osd_req_destroy(obj_request->osd_req);
1517
1518 rbd_assert(obj_request_type_valid(obj_request->type));
1519 switch (obj_request->type) {
9969ebc5
AE
1520 case OBJ_REQUEST_NODATA:
1521 break; /* Nothing to do */
bf0d5f50
AE
1522 case OBJ_REQUEST_BIO:
1523 if (obj_request->bio_list)
1524 bio_chain_put(obj_request->bio_list);
1525 break;
788e2df3
AE
1526 case OBJ_REQUEST_PAGES:
1527 if (obj_request->pages)
1528 ceph_release_page_vector(obj_request->pages,
1529 obj_request->page_count);
1530 break;
bf0d5f50
AE
1531 }
1532
1533 kfree(obj_request);
1534}
1535
1536/*
1537 * Caller is responsible for filling in the list of object requests
1538 * that comprises the image request, and the Linux request pointer
1539 * (if there is one).
1540 */
cc344fa1
AE
1541static struct rbd_img_request *rbd_img_request_create(
1542 struct rbd_device *rbd_dev,
bf0d5f50
AE
1543 u64 offset, u64 length,
1544 bool write_request)
1545{
1546 struct rbd_img_request *img_request;
1547 struct ceph_snap_context *snapc = NULL;
1548
1549 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1550 if (!img_request)
1551 return NULL;
1552
1553 if (write_request) {
1554 down_read(&rbd_dev->header_rwsem);
1555 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1556 up_read(&rbd_dev->header_rwsem);
1557 if (WARN_ON(!snapc)) {
1558 kfree(img_request);
1559 return NULL; /* Shouldn't happen */
1560 }
1561 }
1562
1563 img_request->rq = NULL;
1564 img_request->rbd_dev = rbd_dev;
1565 img_request->offset = offset;
1566 img_request->length = length;
1567 img_request->write_request = write_request;
1568 if (write_request)
1569 img_request->snapc = snapc;
1570 else
1571 img_request->snap_id = rbd_dev->spec->snap_id;
1572 spin_lock_init(&img_request->completion_lock);
1573 img_request->next_completion = 0;
1574 img_request->callback = NULL;
1575 img_request->obj_request_count = 0;
1576 INIT_LIST_HEAD(&img_request->obj_requests);
1577 kref_init(&img_request->kref);
1578
1579 rbd_img_request_get(img_request); /* Avoid a warning */
1580 rbd_img_request_put(img_request); /* TEMPORARY */
1581
37206ee5
AE
1582 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1583 write_request ? "write" : "read", offset, length,
1584 img_request);
1585
bf0d5f50
AE
1586 return img_request;
1587}
1588
1589static void rbd_img_request_destroy(struct kref *kref)
1590{
1591 struct rbd_img_request *img_request;
1592 struct rbd_obj_request *obj_request;
1593 struct rbd_obj_request *next_obj_request;
1594
1595 img_request = container_of(kref, struct rbd_img_request, kref);
1596
37206ee5
AE
1597 dout("%s: img %p\n", __func__, img_request);
1598
bf0d5f50
AE
1599 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1600 rbd_img_obj_request_del(img_request, obj_request);
25dcf954 1601 rbd_assert(img_request->obj_request_count == 0);
bf0d5f50
AE
1602
1603 if (img_request->write_request)
1604 ceph_put_snap_context(img_request->snapc);
1605
1606 kfree(img_request);
1607}
1608
1609static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1610 struct bio *bio_list)
1611{
1612 struct rbd_device *rbd_dev = img_request->rbd_dev;
1613 struct rbd_obj_request *obj_request = NULL;
1614 struct rbd_obj_request *next_obj_request;
1615 unsigned int bio_offset;
1616 u64 image_offset;
1617 u64 resid;
1618 u16 opcode;
1619
37206ee5
AE
1620 dout("%s: img %p bio %p\n", __func__, img_request, bio_list);
1621
bf0d5f50
AE
1622 opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1623 : CEPH_OSD_OP_READ;
1624 bio_offset = 0;
1625 image_offset = img_request->offset;
1626 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1627 resid = img_request->length;
4dda41d3 1628 rbd_assert(resid > 0);
bf0d5f50
AE
1629 while (resid) {
1630 const char *object_name;
1631 unsigned int clone_size;
1632 struct ceph_osd_req_op *op;
1633 u64 offset;
1634 u64 length;
1635
1636 object_name = rbd_segment_name(rbd_dev, image_offset);
1637 if (!object_name)
1638 goto out_unwind;
1639 offset = rbd_segment_offset(rbd_dev, image_offset);
1640 length = rbd_segment_length(rbd_dev, image_offset, resid);
1641 obj_request = rbd_obj_request_create(object_name,
1642 offset, length,
1643 OBJ_REQUEST_BIO);
1644 kfree(object_name); /* object request has its own copy */
1645 if (!obj_request)
1646 goto out_unwind;
1647
1648 rbd_assert(length <= (u64) UINT_MAX);
1649 clone_size = (unsigned int) length;
1650 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1651 &bio_offset, clone_size,
1652 GFP_ATOMIC);
1653 if (!obj_request->bio_list)
1654 goto out_partial;
1655
1656 /*
1657 * Build up the op to use in building the osd
1658 * request. Note that the contents of the op are
1659 * copied by rbd_osd_req_create().
1660 */
1661 op = rbd_osd_req_op_create(opcode, offset, length);
1662 if (!op)
1663 goto out_partial;
1664 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1665 img_request->write_request,
1666 obj_request, op);
1667 rbd_osd_req_op_destroy(op);
1668 if (!obj_request->osd_req)
1669 goto out_partial;
1670 /* status and version are initially zero-filled */
1671
1672 rbd_img_obj_request_add(img_request, obj_request);
1673
1674 image_offset += length;
1675 resid -= length;
1676 }
1677
1678 return 0;
1679
1680out_partial:
1681 rbd_obj_request_put(obj_request);
1682out_unwind:
1683 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1684 rbd_obj_request_put(obj_request);
1685
1686 return -ENOMEM;
1687}
1688
1689static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1690{
1691 struct rbd_img_request *img_request;
1692 u32 which = obj_request->which;
1693 bool more = true;
1694
1695 img_request = obj_request->img_request;
4dda41d3 1696
37206ee5 1697 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
bf0d5f50
AE
1698 rbd_assert(img_request != NULL);
1699 rbd_assert(img_request->rq != NULL);
4dda41d3 1700 rbd_assert(img_request->obj_request_count > 0);
bf0d5f50
AE
1701 rbd_assert(which != BAD_WHICH);
1702 rbd_assert(which < img_request->obj_request_count);
1703 rbd_assert(which >= img_request->next_completion);
1704
1705 spin_lock_irq(&img_request->completion_lock);
1706 if (which != img_request->next_completion)
1707 goto out;
1708
1709 for_each_obj_request_from(img_request, obj_request) {
1710 unsigned int xferred;
1711 int result;
1712
1713 rbd_assert(more);
1714 rbd_assert(which < img_request->obj_request_count);
1715
07741308 1716 if (!obj_request_done_test(obj_request))
bf0d5f50
AE
1717 break;
1718
1719 rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
1720 xferred = (unsigned int) obj_request->xferred;
1721 result = (int) obj_request->result;
1722 if (result)
1723 rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
1724 img_request->write_request ? "write" : "read",
1725 result, xferred);
1726
1727 more = blk_end_request(img_request->rq, result, xferred);
1728 which++;
1729 }
1b83bef2 1730
bf0d5f50
AE
1731 rbd_assert(more ^ (which == img_request->obj_request_count));
1732 img_request->next_completion = which;
1733out:
1734 spin_unlock_irq(&img_request->completion_lock);
1735
1736 if (!more)
1737 rbd_img_request_complete(img_request);
1738}
1739
1740static int rbd_img_request_submit(struct rbd_img_request *img_request)
1741{
1742 struct rbd_device *rbd_dev = img_request->rbd_dev;
1743 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1744 struct rbd_obj_request *obj_request;
1745
37206ee5 1746 dout("%s: img %p\n", __func__, img_request);
bf0d5f50
AE
1747 for_each_obj_request(img_request, obj_request) {
1748 int ret;
1749
1750 obj_request->callback = rbd_img_obj_callback;
1751 ret = rbd_obj_request_submit(osdc, obj_request);
1752 if (ret)
1753 return ret;
1754 /*
1755 * The image request has its own reference to each
1756 * of its object requests, so we can safely drop the
1757 * initial one here.
1758 */
1759 rbd_obj_request_put(obj_request);
1760 }
1761
1762 return 0;
1763}
1764
cf81b60e 1765static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
b8d70035
AE
1766 u64 ver, u64 notify_id)
1767{
1768 struct rbd_obj_request *obj_request;
1769 struct ceph_osd_req_op *op;
1770 struct ceph_osd_client *osdc;
1771 int ret;
1772
1773 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1774 OBJ_REQUEST_NODATA);
1775 if (!obj_request)
1776 return -ENOMEM;
1777
1778 ret = -ENOMEM;
1779 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1780 if (!op)
1781 goto out;
1782 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1783 obj_request, op);
1784 rbd_osd_req_op_destroy(op);
1785 if (!obj_request->osd_req)
1786 goto out;
1787
1788 osdc = &rbd_dev->rbd_client->client->osdc;
cf81b60e 1789 obj_request->callback = rbd_obj_request_put;
b8d70035 1790 ret = rbd_obj_request_submit(osdc, obj_request);
b8d70035 1791out:
cf81b60e
AE
1792 if (ret)
1793 rbd_obj_request_put(obj_request);
b8d70035
AE
1794
1795 return ret;
1796}
1797
1798static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1799{
1800 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1801 u64 hver;
1802 int rc;
1803
1804 if (!rbd_dev)
1805 return;
1806
37206ee5 1807 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
b8d70035
AE
1808 rbd_dev->header_name, (unsigned long long) notify_id,
1809 (unsigned int) opcode);
1810 rc = rbd_dev_refresh(rbd_dev, &hver);
1811 if (rc)
1812 rbd_warn(rbd_dev, "got notification but failed to "
1813 " update snaps: %d\n", rc);
1814
cf81b60e 1815 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
b8d70035
AE
1816}
1817
9969ebc5
AE
1818/*
1819 * Request sync osd watch/unwatch. The value of "start" determines
1820 * whether a watch request is being initiated or torn down.
1821 */
1822static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1823{
1824 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1825 struct rbd_obj_request *obj_request;
1826 struct ceph_osd_req_op *op;
1827 int ret;
1828
1829 rbd_assert(start ^ !!rbd_dev->watch_event);
1830 rbd_assert(start ^ !!rbd_dev->watch_request);
1831
1832 if (start) {
3c663bbd 1833 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
9969ebc5
AE
1834 &rbd_dev->watch_event);
1835 if (ret < 0)
1836 return ret;
8eb87565 1837 rbd_assert(rbd_dev->watch_event != NULL);
9969ebc5
AE
1838 }
1839
1840 ret = -ENOMEM;
1841 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1842 OBJ_REQUEST_NODATA);
1843 if (!obj_request)
1844 goto out_cancel;
1845
1846 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1847 rbd_dev->watch_event->cookie,
1848 rbd_dev->header.obj_version, start);
1849 if (!op)
1850 goto out_cancel;
1851 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
1852 obj_request, op);
1853 rbd_osd_req_op_destroy(op);
1854 if (!obj_request->osd_req)
1855 goto out_cancel;
1856
8eb87565 1857 if (start)
975241af 1858 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
8eb87565 1859 else
6977c3f9 1860 ceph_osdc_unregister_linger_request(osdc,
975241af 1861 rbd_dev->watch_request->osd_req);
9969ebc5
AE
1862 ret = rbd_obj_request_submit(osdc, obj_request);
1863 if (ret)
1864 goto out_cancel;
1865 ret = rbd_obj_request_wait(obj_request);
1866 if (ret)
1867 goto out_cancel;
9969ebc5
AE
1868 ret = obj_request->result;
1869 if (ret)
1870 goto out_cancel;
1871
8eb87565
AE
1872 /*
1873 * A watch request is set to linger, so the underlying osd
1874 * request won't go away until we unregister it. We retain
1875 * a pointer to the object request during that time (in
1876 * rbd_dev->watch_request), so we'll keep a reference to
1877 * it. We'll drop that reference (below) after we've
1878 * unregistered it.
1879 */
1880 if (start) {
1881 rbd_dev->watch_request = obj_request;
1882
1883 return 0;
1884 }
1885
1886 /* We have successfully torn down the watch request */
1887
1888 rbd_obj_request_put(rbd_dev->watch_request);
1889 rbd_dev->watch_request = NULL;
9969ebc5
AE
1890out_cancel:
1891 /* Cancel the event if we're tearing down, or on error */
1892 ceph_osdc_cancel_event(rbd_dev->watch_event);
1893 rbd_dev->watch_event = NULL;
9969ebc5
AE
1894 if (obj_request)
1895 rbd_obj_request_put(obj_request);
1896
1897 return ret;
1898}
1899
36be9a76
AE
1900/*
1901 * Synchronous osd object method call
1902 */
1903static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1904 const char *object_name,
1905 const char *class_name,
1906 const char *method_name,
1907 const char *outbound,
1908 size_t outbound_size,
1909 char *inbound,
1910 size_t inbound_size,
1911 u64 *version)
1912{
1913 struct rbd_obj_request *obj_request;
1914 struct ceph_osd_client *osdc;
1915 struct ceph_osd_req_op *op;
1916 struct page **pages;
1917 u32 page_count;
1918 int ret;
1919
1920 /*
1921 * Method calls are ultimately read operations but they
1922 * don't involve object data (so no offset or length).
1923 * The result should placed into the inbound buffer
1924 * provided. They also supply outbound data--parameters for
1925 * the object method. Currently if this is present it will
1926 * be a snapshot id.
1927 */
1928 page_count = (u32) calc_pages_for(0, inbound_size);
1929 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1930 if (IS_ERR(pages))
1931 return PTR_ERR(pages);
1932
1933 ret = -ENOMEM;
1934 obj_request = rbd_obj_request_create(object_name, 0, 0,
1935 OBJ_REQUEST_PAGES);
1936 if (!obj_request)
1937 goto out;
1938
1939 obj_request->pages = pages;
1940 obj_request->page_count = page_count;
1941
1942 op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1943 method_name, outbound, outbound_size);
1944 if (!op)
1945 goto out;
1946 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1947 obj_request, op);
1948 rbd_osd_req_op_destroy(op);
1949 if (!obj_request->osd_req)
1950 goto out;
1951
1952 osdc = &rbd_dev->rbd_client->client->osdc;
1953 ret = rbd_obj_request_submit(osdc, obj_request);
1954 if (ret)
1955 goto out;
1956 ret = rbd_obj_request_wait(obj_request);
1957 if (ret)
1958 goto out;
1959
1960 ret = obj_request->result;
1961 if (ret < 0)
1962 goto out;
23ed6e13 1963 ret = 0;
903bb32e 1964 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
36be9a76
AE
1965 if (version)
1966 *version = obj_request->version;
1967out:
1968 if (obj_request)
1969 rbd_obj_request_put(obj_request);
1970 else
1971 ceph_release_page_vector(pages, page_count);
1972
1973 return ret;
1974}
1975
bf0d5f50 1976static void rbd_request_fn(struct request_queue *q)
cc344fa1 1977 __releases(q->queue_lock) __acquires(q->queue_lock)
bf0d5f50
AE
1978{
1979 struct rbd_device *rbd_dev = q->queuedata;
1980 bool read_only = rbd_dev->mapping.read_only;
1981 struct request *rq;
1982 int result;
1983
1984 while ((rq = blk_fetch_request(q))) {
1985 bool write_request = rq_data_dir(rq) == WRITE;
1986 struct rbd_img_request *img_request;
1987 u64 offset;
1988 u64 length;
1989
1990 /* Ignore any non-FS requests that filter through. */
1991
1992 if (rq->cmd_type != REQ_TYPE_FS) {
4dda41d3
AE
1993 dout("%s: non-fs request type %d\n", __func__,
1994 (int) rq->cmd_type);
1995 __blk_end_request_all(rq, 0);
1996 continue;
1997 }
1998
1999 /* Ignore/skip any zero-length requests */
2000
2001 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2002 length = (u64) blk_rq_bytes(rq);
2003
2004 if (!length) {
2005 dout("%s: zero-length request\n", __func__);
bf0d5f50
AE
2006 __blk_end_request_all(rq, 0);
2007 continue;
2008 }
2009
2010 spin_unlock_irq(q->queue_lock);
2011
2012 /* Disallow writes to a read-only device */
2013
2014 if (write_request) {
2015 result = -EROFS;
2016 if (read_only)
2017 goto end_request;
2018 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2019 }
2020
6d292906
AE
2021 /*
2022 * Quit early if the mapped snapshot no longer
2023 * exists. It's still possible the snapshot will
2024 * have disappeared by the time our request arrives
2025 * at the osd, but there's no sense in sending it if
2026 * we already know.
2027 */
2028 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
bf0d5f50
AE
2029 dout("request for non-existent snapshot");
2030 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2031 result = -ENXIO;
2032 goto end_request;
2033 }
2034
bf0d5f50
AE
2035 result = -EINVAL;
2036 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2037 goto end_request; /* Shouldn't happen */
2038
2039 result = -ENOMEM;
2040 img_request = rbd_img_request_create(rbd_dev, offset, length,
2041 write_request);
2042 if (!img_request)
2043 goto end_request;
2044
2045 img_request->rq = rq;
2046
2047 result = rbd_img_request_fill_bio(img_request, rq->bio);
2048 if (!result)
2049 result = rbd_img_request_submit(img_request);
2050 if (result)
2051 rbd_img_request_put(img_request);
2052end_request:
2053 spin_lock_irq(q->queue_lock);
2054 if (result < 0) {
2055 rbd_warn(rbd_dev, "obj_request %s result %d\n",
2056 write_request ? "write" : "read", result);
2057 __blk_end_request_all(rq, result);
2058 }
2059 }
2060}
2061
602adf40
YS
2062/*
2063 * a queue callback. Makes sure that we don't create a bio that spans across
2064 * multiple osd objects. One exception would be with a single page bios,
f7760dad 2065 * which we handle later at bio_chain_clone_range()
602adf40
YS
2066 */
2067static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2068 struct bio_vec *bvec)
2069{
2070 struct rbd_device *rbd_dev = q->queuedata;
e5cfeed2
AE
2071 sector_t sector_offset;
2072 sector_t sectors_per_obj;
2073 sector_t obj_sector_offset;
2074 int ret;
2075
2076 /*
2077 * Find how far into its rbd object the partition-relative
2078 * bio start sector is to offset relative to the enclosing
2079 * device.
2080 */
2081 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2082 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2083 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2084
2085 /*
2086 * Compute the number of bytes from that offset to the end
2087 * of the object. Account for what's already used by the bio.
2088 */
2089 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2090 if (ret > bmd->bi_size)
2091 ret -= bmd->bi_size;
2092 else
2093 ret = 0;
2094
2095 /*
2096 * Don't send back more than was asked for. And if the bio
2097 * was empty, let the whole thing through because: "Note
2098 * that a block device *must* allow a single page to be
2099 * added to an empty bio."
2100 */
2101 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2102 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2103 ret = (int) bvec->bv_len;
2104
2105 return ret;
602adf40
YS
2106}
2107
2108static void rbd_free_disk(struct rbd_device *rbd_dev)
2109{
2110 struct gendisk *disk = rbd_dev->disk;
2111
2112 if (!disk)
2113 return;
2114
602adf40
YS
2115 if (disk->flags & GENHD_FL_UP)
2116 del_gendisk(disk);
2117 if (disk->queue)
2118 blk_cleanup_queue(disk->queue);
2119 put_disk(disk);
2120}
2121
788e2df3
AE
2122static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2123 const char *object_name,
2124 u64 offset, u64 length,
2125 char *buf, u64 *version)
2126
2127{
2128 struct ceph_osd_req_op *op;
2129 struct rbd_obj_request *obj_request;
2130 struct ceph_osd_client *osdc;
2131 struct page **pages = NULL;
2132 u32 page_count;
1ceae7ef 2133 size_t size;
788e2df3
AE
2134 int ret;
2135
2136 page_count = (u32) calc_pages_for(offset, length);
2137 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2138 if (IS_ERR(pages))
2139 ret = PTR_ERR(pages);
2140
2141 ret = -ENOMEM;
2142 obj_request = rbd_obj_request_create(object_name, offset, length,
36be9a76 2143 OBJ_REQUEST_PAGES);
788e2df3
AE
2144 if (!obj_request)
2145 goto out;
2146
2147 obj_request->pages = pages;
2148 obj_request->page_count = page_count;
2149
2150 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2151 if (!op)
2152 goto out;
2153 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2154 obj_request, op);
2155 rbd_osd_req_op_destroy(op);
2156 if (!obj_request->osd_req)
2157 goto out;
2158
2159 osdc = &rbd_dev->rbd_client->client->osdc;
2160 ret = rbd_obj_request_submit(osdc, obj_request);
2161 if (ret)
2162 goto out;
2163 ret = rbd_obj_request_wait(obj_request);
2164 if (ret)
2165 goto out;
2166
2167 ret = obj_request->result;
2168 if (ret < 0)
2169 goto out;
1ceae7ef
AE
2170
2171 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2172 size = (size_t) obj_request->xferred;
903bb32e 2173 ceph_copy_from_page_vector(pages, buf, 0, size);
23ed6e13
AE
2174 rbd_assert(size <= (size_t) INT_MAX);
2175 ret = (int) size;
788e2df3
AE
2176 if (version)
2177 *version = obj_request->version;
2178out:
2179 if (obj_request)
2180 rbd_obj_request_put(obj_request);
2181 else
2182 ceph_release_page_vector(pages, page_count);
2183
2184 return ret;
2185}
2186
602adf40 2187/*
4156d998
AE
2188 * Read the complete header for the given rbd device.
2189 *
2190 * Returns a pointer to a dynamically-allocated buffer containing
2191 * the complete and validated header. Caller can pass the address
2192 * of a variable that will be filled in with the version of the
2193 * header object at the time it was read.
2194 *
2195 * Returns a pointer-coded errno if a failure occurs.
602adf40 2196 */
4156d998
AE
2197static struct rbd_image_header_ondisk *
2198rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 2199{
4156d998 2200 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 2201 u32 snap_count = 0;
4156d998
AE
2202 u64 names_size = 0;
2203 u32 want_count;
2204 int ret;
602adf40 2205
00f1f36f 2206 /*
4156d998
AE
2207 * The complete header will include an array of its 64-bit
2208 * snapshot ids, followed by the names of those snapshots as
2209 * a contiguous block of NUL-terminated strings. Note that
2210 * the number of snapshots could change by the time we read
2211 * it in, in which case we re-read it.
00f1f36f 2212 */
4156d998
AE
2213 do {
2214 size_t size;
2215
2216 kfree(ondisk);
2217
2218 size = sizeof (*ondisk);
2219 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2220 size += names_size;
2221 ondisk = kmalloc(size, GFP_KERNEL);
2222 if (!ondisk)
2223 return ERR_PTR(-ENOMEM);
2224
788e2df3 2225 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
4156d998
AE
2226 0, size,
2227 (char *) ondisk, version);
4156d998
AE
2228 if (ret < 0)
2229 goto out_err;
2230 if (WARN_ON((size_t) ret < size)) {
2231 ret = -ENXIO;
06ecc6cb
AE
2232 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2233 size, ret);
4156d998
AE
2234 goto out_err;
2235 }
2236 if (!rbd_dev_ondisk_valid(ondisk)) {
2237 ret = -ENXIO;
06ecc6cb 2238 rbd_warn(rbd_dev, "invalid header");
4156d998 2239 goto out_err;
81e759fb 2240 }
602adf40 2241
4156d998
AE
2242 names_size = le64_to_cpu(ondisk->snap_names_len);
2243 want_count = snap_count;
2244 snap_count = le32_to_cpu(ondisk->snap_count);
2245 } while (snap_count != want_count);
00f1f36f 2246
4156d998 2247 return ondisk;
00f1f36f 2248
4156d998
AE
2249out_err:
2250 kfree(ondisk);
2251
2252 return ERR_PTR(ret);
2253}
2254
2255/*
2256 * reload the ondisk the header
2257 */
2258static int rbd_read_header(struct rbd_device *rbd_dev,
2259 struct rbd_image_header *header)
2260{
2261 struct rbd_image_header_ondisk *ondisk;
2262 u64 ver = 0;
2263 int ret;
602adf40 2264
4156d998
AE
2265 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2266 if (IS_ERR(ondisk))
2267 return PTR_ERR(ondisk);
2268 ret = rbd_header_from_disk(header, ondisk);
2269 if (ret >= 0)
2270 header->obj_version = ver;
2271 kfree(ondisk);
2272
2273 return ret;
602adf40
YS
2274}
2275
41f38c2b 2276static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
dfc5606d
YS
2277{
2278 struct rbd_snap *snap;
a0593290 2279 struct rbd_snap *next;
dfc5606d 2280
a0593290 2281 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
41f38c2b 2282 rbd_remove_snap_dev(snap);
dfc5606d
YS
2283}
2284
9478554a
AE
2285static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2286{
2287 sector_t size;
2288
0d7dbfce 2289 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
9478554a
AE
2290 return;
2291
2292 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2293 dout("setting size to %llu sectors", (unsigned long long) size);
2294 rbd_dev->mapping.size = (u64) size;
2295 set_capacity(rbd_dev->disk, size);
2296}
2297
602adf40
YS
2298/*
2299 * only read the first part of the ondisk header, without the snaps info
2300 */
117973fb 2301static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
2302{
2303 int ret;
2304 struct rbd_image_header h;
602adf40
YS
2305
2306 ret = rbd_read_header(rbd_dev, &h);
2307 if (ret < 0)
2308 return ret;
2309
a51aa0c0
JD
2310 down_write(&rbd_dev->header_rwsem);
2311
9478554a
AE
2312 /* Update image size, and check for resize of mapped image */
2313 rbd_dev->header.image_size = h.image_size;
2314 rbd_update_mapping_size(rbd_dev);
9db4b3e3 2315
849b4260 2316 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 2317 kfree(rbd_dev->header.snap_sizes);
849b4260 2318 kfree(rbd_dev->header.snap_names);
d1d25646
JD
2319 /* osd requests may still refer to snapc */
2320 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 2321
b813623a
AE
2322 if (hver)
2323 *hver = h.obj_version;
a71b891b 2324 rbd_dev->header.obj_version = h.obj_version;
93a24e08 2325 rbd_dev->header.image_size = h.image_size;
602adf40
YS
2326 rbd_dev->header.snapc = h.snapc;
2327 rbd_dev->header.snap_names = h.snap_names;
2328 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
2329 /* Free the extra copy of the object prefix */
2330 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2331 kfree(h.object_prefix);
2332
304f6808
AE
2333 ret = rbd_dev_snaps_update(rbd_dev);
2334 if (!ret)
2335 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 2336
c666601a 2337 up_write(&rbd_dev->header_rwsem);
602adf40 2338
dfc5606d 2339 return ret;
602adf40
YS
2340}
2341
117973fb 2342static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
1fe5e993
AE
2343{
2344 int ret;
2345
117973fb 2346 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 2347 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
2348 if (rbd_dev->image_format == 1)
2349 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2350 else
2351 ret = rbd_dev_v2_refresh(rbd_dev, hver);
1fe5e993
AE
2352 mutex_unlock(&ctl_mutex);
2353
2354 return ret;
2355}
2356
602adf40
YS
2357static int rbd_init_disk(struct rbd_device *rbd_dev)
2358{
2359 struct gendisk *disk;
2360 struct request_queue *q;
593a9e7b 2361 u64 segment_size;
602adf40 2362
602adf40 2363 /* create gendisk info */
602adf40
YS
2364 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2365 if (!disk)
1fcdb8aa 2366 return -ENOMEM;
602adf40 2367
f0f8cef5 2368 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 2369 rbd_dev->dev_id);
602adf40
YS
2370 disk->major = rbd_dev->major;
2371 disk->first_minor = 0;
2372 disk->fops = &rbd_bd_ops;
2373 disk->private_data = rbd_dev;
2374
bf0d5f50 2375 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
602adf40
YS
2376 if (!q)
2377 goto out_disk;
029bcbd8 2378
593a9e7b
AE
2379 /* We use the default size, but let's be explicit about it. */
2380 blk_queue_physical_block_size(q, SECTOR_SIZE);
2381
029bcbd8 2382 /* set io sizes to object size */
593a9e7b
AE
2383 segment_size = rbd_obj_bytes(&rbd_dev->header);
2384 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2385 blk_queue_max_segment_size(q, segment_size);
2386 blk_queue_io_min(q, segment_size);
2387 blk_queue_io_opt(q, segment_size);
029bcbd8 2388
602adf40
YS
2389 blk_queue_merge_bvec(q, rbd_merge_bvec);
2390 disk->queue = q;
2391
2392 q->queuedata = rbd_dev;
2393
2394 rbd_dev->disk = disk;
602adf40 2395
12f02944
AE
2396 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2397
602adf40 2398 return 0;
602adf40
YS
2399out_disk:
2400 put_disk(disk);
1fcdb8aa
AE
2401
2402 return -ENOMEM;
602adf40
YS
2403}
2404
dfc5606d
YS
2405/*
2406 sysfs
2407*/
2408
593a9e7b
AE
2409static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2410{
2411 return container_of(dev, struct rbd_device, dev);
2412}
2413
dfc5606d
YS
2414static ssize_t rbd_size_show(struct device *dev,
2415 struct device_attribute *attr, char *buf)
2416{
593a9e7b 2417 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
2418 sector_t size;
2419
2420 down_read(&rbd_dev->header_rwsem);
2421 size = get_capacity(rbd_dev->disk);
2422 up_read(&rbd_dev->header_rwsem);
dfc5606d 2423
a51aa0c0 2424 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
2425}
2426
34b13184
AE
2427/*
2428 * Note this shows the features for whatever's mapped, which is not
2429 * necessarily the base image.
2430 */
2431static ssize_t rbd_features_show(struct device *dev,
2432 struct device_attribute *attr, char *buf)
2433{
2434 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2435
2436 return sprintf(buf, "0x%016llx\n",
2437 (unsigned long long) rbd_dev->mapping.features);
2438}
2439
dfc5606d
YS
2440static ssize_t rbd_major_show(struct device *dev,
2441 struct device_attribute *attr, char *buf)
2442{
593a9e7b 2443 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 2444
dfc5606d
YS
2445 return sprintf(buf, "%d\n", rbd_dev->major);
2446}
2447
2448static ssize_t rbd_client_id_show(struct device *dev,
2449 struct device_attribute *attr, char *buf)
602adf40 2450{
593a9e7b 2451 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2452
1dbb4399
AE
2453 return sprintf(buf, "client%lld\n",
2454 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
2455}
2456
dfc5606d
YS
2457static ssize_t rbd_pool_show(struct device *dev,
2458 struct device_attribute *attr, char *buf)
602adf40 2459{
593a9e7b 2460 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2461
0d7dbfce 2462 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
dfc5606d
YS
2463}
2464
9bb2f334
AE
2465static ssize_t rbd_pool_id_show(struct device *dev,
2466 struct device_attribute *attr, char *buf)
2467{
2468 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2469
0d7dbfce
AE
2470 return sprintf(buf, "%llu\n",
2471 (unsigned long long) rbd_dev->spec->pool_id);
9bb2f334
AE
2472}
2473
dfc5606d
YS
2474static ssize_t rbd_name_show(struct device *dev,
2475 struct device_attribute *attr, char *buf)
2476{
593a9e7b 2477 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2478
a92ffdf8
AE
2479 if (rbd_dev->spec->image_name)
2480 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2481
2482 return sprintf(buf, "(unknown)\n");
dfc5606d
YS
2483}
2484
589d30e0
AE
2485static ssize_t rbd_image_id_show(struct device *dev,
2486 struct device_attribute *attr, char *buf)
2487{
2488 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2489
0d7dbfce 2490 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
589d30e0
AE
2491}
2492
34b13184
AE
2493/*
2494 * Shows the name of the currently-mapped snapshot (or
2495 * RBD_SNAP_HEAD_NAME for the base image).
2496 */
dfc5606d
YS
2497static ssize_t rbd_snap_show(struct device *dev,
2498 struct device_attribute *attr,
2499 char *buf)
2500{
593a9e7b 2501 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 2502
0d7dbfce 2503 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
dfc5606d
YS
2504}
2505
86b00e0d
AE
2506/*
2507 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2508 * for the parent image. If there is no parent, simply shows
2509 * "(no parent image)".
2510 */
2511static ssize_t rbd_parent_show(struct device *dev,
2512 struct device_attribute *attr,
2513 char *buf)
2514{
2515 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2516 struct rbd_spec *spec = rbd_dev->parent_spec;
2517 int count;
2518 char *bufp = buf;
2519
2520 if (!spec)
2521 return sprintf(buf, "(no parent image)\n");
2522
2523 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2524 (unsigned long long) spec->pool_id, spec->pool_name);
2525 if (count < 0)
2526 return count;
2527 bufp += count;
2528
2529 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2530 spec->image_name ? spec->image_name : "(unknown)");
2531 if (count < 0)
2532 return count;
2533 bufp += count;
2534
2535 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2536 (unsigned long long) spec->snap_id, spec->snap_name);
2537 if (count < 0)
2538 return count;
2539 bufp += count;
2540
2541 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2542 if (count < 0)
2543 return count;
2544 bufp += count;
2545
2546 return (ssize_t) (bufp - buf);
2547}
2548
dfc5606d
YS
2549static ssize_t rbd_image_refresh(struct device *dev,
2550 struct device_attribute *attr,
2551 const char *buf,
2552 size_t size)
2553{
593a9e7b 2554 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 2555 int ret;
602adf40 2556
117973fb 2557 ret = rbd_dev_refresh(rbd_dev, NULL);
b813623a
AE
2558
2559 return ret < 0 ? ret : size;
dfc5606d 2560}
602adf40 2561
dfc5606d 2562static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 2563static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
2564static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2565static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2566static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 2567static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 2568static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 2569static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
2570static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2571static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
86b00e0d 2572static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
dfc5606d
YS
2573
2574static struct attribute *rbd_attrs[] = {
2575 &dev_attr_size.attr,
34b13184 2576 &dev_attr_features.attr,
dfc5606d
YS
2577 &dev_attr_major.attr,
2578 &dev_attr_client_id.attr,
2579 &dev_attr_pool.attr,
9bb2f334 2580 &dev_attr_pool_id.attr,
dfc5606d 2581 &dev_attr_name.attr,
589d30e0 2582 &dev_attr_image_id.attr,
dfc5606d 2583 &dev_attr_current_snap.attr,
86b00e0d 2584 &dev_attr_parent.attr,
dfc5606d 2585 &dev_attr_refresh.attr,
dfc5606d
YS
2586 NULL
2587};
2588
2589static struct attribute_group rbd_attr_group = {
2590 .attrs = rbd_attrs,
2591};
2592
2593static const struct attribute_group *rbd_attr_groups[] = {
2594 &rbd_attr_group,
2595 NULL
2596};
2597
2598static void rbd_sysfs_dev_release(struct device *dev)
2599{
2600}
2601
2602static struct device_type rbd_device_type = {
2603 .name = "rbd",
2604 .groups = rbd_attr_groups,
2605 .release = rbd_sysfs_dev_release,
2606};
2607
2608
2609/*
2610 sysfs - snapshots
2611*/
2612
2613static ssize_t rbd_snap_size_show(struct device *dev,
2614 struct device_attribute *attr,
2615 char *buf)
2616{
2617 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2618
3591538f 2619 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
2620}
2621
2622static ssize_t rbd_snap_id_show(struct device *dev,
2623 struct device_attribute *attr,
2624 char *buf)
2625{
2626 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2627
3591538f 2628 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2629}
2630
34b13184
AE
2631static ssize_t rbd_snap_features_show(struct device *dev,
2632 struct device_attribute *attr,
2633 char *buf)
2634{
2635 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2636
2637 return sprintf(buf, "0x%016llx\n",
2638 (unsigned long long) snap->features);
2639}
2640
dfc5606d
YS
2641static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2642static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2643static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2644
2645static struct attribute *rbd_snap_attrs[] = {
2646 &dev_attr_snap_size.attr,
2647 &dev_attr_snap_id.attr,
34b13184 2648 &dev_attr_snap_features.attr,
dfc5606d
YS
2649 NULL,
2650};
2651
2652static struct attribute_group rbd_snap_attr_group = {
2653 .attrs = rbd_snap_attrs,
2654};
2655
2656static void rbd_snap_dev_release(struct device *dev)
2657{
2658 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2659 kfree(snap->name);
2660 kfree(snap);
2661}
2662
2663static const struct attribute_group *rbd_snap_attr_groups[] = {
2664 &rbd_snap_attr_group,
2665 NULL
2666};
2667
2668static struct device_type rbd_snap_device_type = {
2669 .groups = rbd_snap_attr_groups,
2670 .release = rbd_snap_dev_release,
2671};
2672
8b8fb99c
AE
2673static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2674{
2675 kref_get(&spec->kref);
2676
2677 return spec;
2678}
2679
2680static void rbd_spec_free(struct kref *kref);
2681static void rbd_spec_put(struct rbd_spec *spec)
2682{
2683 if (spec)
2684 kref_put(&spec->kref, rbd_spec_free);
2685}
2686
2687static struct rbd_spec *rbd_spec_alloc(void)
2688{
2689 struct rbd_spec *spec;
2690
2691 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2692 if (!spec)
2693 return NULL;
2694 kref_init(&spec->kref);
2695
2696 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2697
2698 return spec;
2699}
2700
2701static void rbd_spec_free(struct kref *kref)
2702{
2703 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2704
2705 kfree(spec->pool_name);
2706 kfree(spec->image_id);
2707 kfree(spec->image_name);
2708 kfree(spec->snap_name);
2709 kfree(spec);
2710}
2711
cc344fa1 2712static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
c53d5893
AE
2713 struct rbd_spec *spec)
2714{
2715 struct rbd_device *rbd_dev;
2716
2717 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2718 if (!rbd_dev)
2719 return NULL;
2720
2721 spin_lock_init(&rbd_dev->lock);
6d292906 2722 rbd_dev->flags = 0;
c53d5893
AE
2723 INIT_LIST_HEAD(&rbd_dev->node);
2724 INIT_LIST_HEAD(&rbd_dev->snaps);
2725 init_rwsem(&rbd_dev->header_rwsem);
2726
2727 rbd_dev->spec = spec;
2728 rbd_dev->rbd_client = rbdc;
2729
0903e875
AE
2730 /* Initialize the layout used for all rbd requests */
2731
2732 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2733 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2734 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2735 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2736
c53d5893
AE
2737 return rbd_dev;
2738}
2739
2740static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2741{
86b00e0d 2742 rbd_spec_put(rbd_dev->parent_spec);
c53d5893
AE
2743 kfree(rbd_dev->header_name);
2744 rbd_put_client(rbd_dev->rbd_client);
2745 rbd_spec_put(rbd_dev->spec);
2746 kfree(rbd_dev);
2747}
2748
304f6808
AE
2749static bool rbd_snap_registered(struct rbd_snap *snap)
2750{
2751 bool ret = snap->dev.type == &rbd_snap_device_type;
2752 bool reg = device_is_registered(&snap->dev);
2753
2754 rbd_assert(!ret ^ reg);
2755
2756 return ret;
2757}
2758
41f38c2b 2759static void rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2760{
2761 list_del(&snap->node);
304f6808
AE
2762 if (device_is_registered(&snap->dev))
2763 device_unregister(&snap->dev);
dfc5606d
YS
2764}
2765
14e7085d 2766static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2767 struct device *parent)
2768{
2769 struct device *dev = &snap->dev;
2770 int ret;
2771
2772 dev->type = &rbd_snap_device_type;
2773 dev->parent = parent;
2774 dev->release = rbd_snap_dev_release;
d4b125e9 2775 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
304f6808
AE
2776 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2777
dfc5606d
YS
2778 ret = device_register(dev);
2779
2780 return ret;
2781}
2782
4e891e0a 2783static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2784 const char *snap_name,
34b13184
AE
2785 u64 snap_id, u64 snap_size,
2786 u64 snap_features)
dfc5606d 2787{
4e891e0a 2788 struct rbd_snap *snap;
dfc5606d 2789 int ret;
4e891e0a
AE
2790
2791 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2792 if (!snap)
4e891e0a
AE
2793 return ERR_PTR(-ENOMEM);
2794
2795 ret = -ENOMEM;
c8d18425 2796 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2797 if (!snap->name)
2798 goto err;
2799
c8d18425
AE
2800 snap->id = snap_id;
2801 snap->size = snap_size;
34b13184 2802 snap->features = snap_features;
4e891e0a
AE
2803
2804 return snap;
2805
dfc5606d
YS
2806err:
2807 kfree(snap->name);
2808 kfree(snap);
4e891e0a
AE
2809
2810 return ERR_PTR(ret);
dfc5606d
YS
2811}
2812
cd892126
AE
2813static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2814 u64 *snap_size, u64 *snap_features)
2815{
2816 char *snap_name;
2817
2818 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2819
2820 *snap_size = rbd_dev->header.snap_sizes[which];
2821 *snap_features = 0; /* No features for v1 */
2822
2823 /* Skip over names until we find the one we are looking for */
2824
2825 snap_name = rbd_dev->header.snap_names;
2826 while (which--)
2827 snap_name += strlen(snap_name) + 1;
2828
2829 return snap_name;
2830}
2831
9d475de5
AE
2832/*
2833 * Get the size and object order for an image snapshot, or if
2834 * snap_id is CEPH_NOSNAP, gets this information for the base
2835 * image.
2836 */
2837static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2838 u8 *order, u64 *snap_size)
2839{
2840 __le64 snapid = cpu_to_le64(snap_id);
2841 int ret;
2842 struct {
2843 u8 order;
2844 __le64 size;
2845 } __attribute__ ((packed)) size_buf = { 0 };
2846
36be9a76 2847 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
9d475de5
AE
2848 "rbd", "get_size",
2849 (char *) &snapid, sizeof (snapid),
07b2391f 2850 (char *) &size_buf, sizeof (size_buf), NULL);
36be9a76 2851 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
9d475de5
AE
2852 if (ret < 0)
2853 return ret;
2854
2855 *order = size_buf.order;
2856 *snap_size = le64_to_cpu(size_buf.size);
2857
2858 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2859 (unsigned long long) snap_id, (unsigned int) *order,
2860 (unsigned long long) *snap_size);
2861
2862 return 0;
2863}
2864
2865static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2866{
2867 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2868 &rbd_dev->header.obj_order,
2869 &rbd_dev->header.image_size);
2870}
2871
1e130199
AE
2872static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2873{
2874 void *reply_buf;
2875 int ret;
2876 void *p;
2877
2878 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2879 if (!reply_buf)
2880 return -ENOMEM;
2881
36be9a76 2882 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
1e130199
AE
2883 "rbd", "get_object_prefix",
2884 NULL, 0,
07b2391f 2885 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
36be9a76 2886 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
1e130199
AE
2887 if (ret < 0)
2888 goto out;
2889
2890 p = reply_buf;
2891 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2892 p + RBD_OBJ_PREFIX_LEN_MAX,
2893 NULL, GFP_NOIO);
2894
2895 if (IS_ERR(rbd_dev->header.object_prefix)) {
2896 ret = PTR_ERR(rbd_dev->header.object_prefix);
2897 rbd_dev->header.object_prefix = NULL;
2898 } else {
2899 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2900 }
2901
2902out:
2903 kfree(reply_buf);
2904
2905 return ret;
2906}
2907
b1b5402a
AE
2908static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2909 u64 *snap_features)
2910{
2911 __le64 snapid = cpu_to_le64(snap_id);
2912 struct {
2913 __le64 features;
2914 __le64 incompat;
2915 } features_buf = { 0 };
d889140c 2916 u64 incompat;
b1b5402a
AE
2917 int ret;
2918
36be9a76 2919 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
b1b5402a
AE
2920 "rbd", "get_features",
2921 (char *) &snapid, sizeof (snapid),
2922 (char *) &features_buf, sizeof (features_buf),
07b2391f 2923 NULL);
36be9a76 2924 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b1b5402a
AE
2925 if (ret < 0)
2926 return ret;
d889140c
AE
2927
2928 incompat = le64_to_cpu(features_buf.incompat);
2929 if (incompat & ~RBD_FEATURES_ALL)
b8f5c6ed 2930 return -ENXIO;
d889140c 2931
b1b5402a
AE
2932 *snap_features = le64_to_cpu(features_buf.features);
2933
2934 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2935 (unsigned long long) snap_id,
2936 (unsigned long long) *snap_features,
2937 (unsigned long long) le64_to_cpu(features_buf.incompat));
2938
2939 return 0;
2940}
2941
2942static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2943{
2944 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2945 &rbd_dev->header.features);
2946}
2947
86b00e0d
AE
2948static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2949{
2950 struct rbd_spec *parent_spec;
2951 size_t size;
2952 void *reply_buf = NULL;
2953 __le64 snapid;
2954 void *p;
2955 void *end;
2956 char *image_id;
2957 u64 overlap;
86b00e0d
AE
2958 int ret;
2959
2960 parent_spec = rbd_spec_alloc();
2961 if (!parent_spec)
2962 return -ENOMEM;
2963
2964 size = sizeof (__le64) + /* pool_id */
2965 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2966 sizeof (__le64) + /* snap_id */
2967 sizeof (__le64); /* overlap */
2968 reply_buf = kmalloc(size, GFP_KERNEL);
2969 if (!reply_buf) {
2970 ret = -ENOMEM;
2971 goto out_err;
2972 }
2973
2974 snapid = cpu_to_le64(CEPH_NOSNAP);
36be9a76 2975 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
86b00e0d
AE
2976 "rbd", "get_parent",
2977 (char *) &snapid, sizeof (snapid),
07b2391f 2978 (char *) reply_buf, size, NULL);
36be9a76 2979 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
86b00e0d
AE
2980 if (ret < 0)
2981 goto out_err;
2982
2983 ret = -ERANGE;
2984 p = reply_buf;
2985 end = (char *) reply_buf + size;
2986 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2987 if (parent_spec->pool_id == CEPH_NOPOOL)
2988 goto out; /* No parent? No problem. */
2989
0903e875
AE
2990 /* The ceph file layout needs to fit pool id in 32 bits */
2991
2992 ret = -EIO;
2993 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2994 goto out;
2995
979ed480 2996 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
86b00e0d
AE
2997 if (IS_ERR(image_id)) {
2998 ret = PTR_ERR(image_id);
2999 goto out_err;
3000 }
3001 parent_spec->image_id = image_id;
3002 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3003 ceph_decode_64_safe(&p, end, overlap, out_err);
3004
3005 rbd_dev->parent_overlap = overlap;
3006 rbd_dev->parent_spec = parent_spec;
3007 parent_spec = NULL; /* rbd_dev now owns this */
3008out:
3009 ret = 0;
3010out_err:
3011 kfree(reply_buf);
3012 rbd_spec_put(parent_spec);
3013
3014 return ret;
3015}
3016
9e15b77d
AE
3017static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3018{
3019 size_t image_id_size;
3020 char *image_id;
3021 void *p;
3022 void *end;
3023 size_t size;
3024 void *reply_buf = NULL;
3025 size_t len = 0;
3026 char *image_name = NULL;
3027 int ret;
3028
3029 rbd_assert(!rbd_dev->spec->image_name);
3030
69e7a02f
AE
3031 len = strlen(rbd_dev->spec->image_id);
3032 image_id_size = sizeof (__le32) + len;
9e15b77d
AE
3033 image_id = kmalloc(image_id_size, GFP_KERNEL);
3034 if (!image_id)
3035 return NULL;
3036
3037 p = image_id;
3038 end = (char *) image_id + image_id_size;
69e7a02f 3039 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
9e15b77d
AE
3040
3041 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3042 reply_buf = kmalloc(size, GFP_KERNEL);
3043 if (!reply_buf)
3044 goto out;
3045
36be9a76 3046 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
9e15b77d
AE
3047 "rbd", "dir_get_name",
3048 image_id, image_id_size,
07b2391f 3049 (char *) reply_buf, size, NULL);
9e15b77d
AE
3050 if (ret < 0)
3051 goto out;
3052 p = reply_buf;
3053 end = (char *) reply_buf + size;
3054 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3055 if (IS_ERR(image_name))
3056 image_name = NULL;
3057 else
3058 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3059out:
3060 kfree(reply_buf);
3061 kfree(image_id);
3062
3063 return image_name;
3064}
3065
3066/*
3067 * When a parent image gets probed, we only have the pool, image,
3068 * and snapshot ids but not the names of any of them. This call
3069 * is made later to fill in those names. It has to be done after
3070 * rbd_dev_snaps_update() has completed because some of the
3071 * information (in particular, snapshot name) is not available
3072 * until then.
3073 */
3074static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3075{
3076 struct ceph_osd_client *osdc;
3077 const char *name;
3078 void *reply_buf = NULL;
3079 int ret;
3080
3081 if (rbd_dev->spec->pool_name)
3082 return 0; /* Already have the names */
3083
3084 /* Look up the pool name */
3085
3086 osdc = &rbd_dev->rbd_client->client->osdc;
3087 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
935dc89f
AE
3088 if (!name) {
3089 rbd_warn(rbd_dev, "there is no pool with id %llu",
3090 rbd_dev->spec->pool_id); /* Really a BUG() */
3091 return -EIO;
3092 }
9e15b77d
AE
3093
3094 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3095 if (!rbd_dev->spec->pool_name)
3096 return -ENOMEM;
3097
3098 /* Fetch the image name; tolerate failure here */
3099
3100 name = rbd_dev_image_name(rbd_dev);
69e7a02f 3101 if (name)
9e15b77d 3102 rbd_dev->spec->image_name = (char *) name;
69e7a02f 3103 else
06ecc6cb 3104 rbd_warn(rbd_dev, "unable to get image name");
9e15b77d
AE
3105
3106 /* Look up the snapshot name. */
3107
3108 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3109 if (!name) {
935dc89f
AE
3110 rbd_warn(rbd_dev, "no snapshot with id %llu",
3111 rbd_dev->spec->snap_id); /* Really a BUG() */
9e15b77d
AE
3112 ret = -EIO;
3113 goto out_err;
3114 }
3115 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3116 if(!rbd_dev->spec->snap_name)
3117 goto out_err;
3118
3119 return 0;
3120out_err:
3121 kfree(reply_buf);
3122 kfree(rbd_dev->spec->pool_name);
3123 rbd_dev->spec->pool_name = NULL;
3124
3125 return ret;
3126}
3127
6e14b1a6 3128static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
35d489f9
AE
3129{
3130 size_t size;
3131 int ret;
3132 void *reply_buf;
3133 void *p;
3134 void *end;
3135 u64 seq;
3136 u32 snap_count;
3137 struct ceph_snap_context *snapc;
3138 u32 i;
3139
3140 /*
3141 * We'll need room for the seq value (maximum snapshot id),
3142 * snapshot count, and array of that many snapshot ids.
3143 * For now we have a fixed upper limit on the number we're
3144 * prepared to receive.
3145 */
3146 size = sizeof (__le64) + sizeof (__le32) +
3147 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3148 reply_buf = kzalloc(size, GFP_KERNEL);
3149 if (!reply_buf)
3150 return -ENOMEM;
3151
36be9a76 3152 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
35d489f9
AE
3153 "rbd", "get_snapcontext",
3154 NULL, 0,
07b2391f 3155 reply_buf, size, ver);
36be9a76 3156 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
35d489f9
AE
3157 if (ret < 0)
3158 goto out;
3159
3160 ret = -ERANGE;
3161 p = reply_buf;
3162 end = (char *) reply_buf + size;
3163 ceph_decode_64_safe(&p, end, seq, out);
3164 ceph_decode_32_safe(&p, end, snap_count, out);
3165
3166 /*
3167 * Make sure the reported number of snapshot ids wouldn't go
3168 * beyond the end of our buffer. But before checking that,
3169 * make sure the computed size of the snapshot context we
3170 * allocate is representable in a size_t.
3171 */
3172 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3173 / sizeof (u64)) {
3174 ret = -EINVAL;
3175 goto out;
3176 }
3177 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3178 goto out;
3179
3180 size = sizeof (struct ceph_snap_context) +
3181 snap_count * sizeof (snapc->snaps[0]);
3182 snapc = kmalloc(size, GFP_KERNEL);
3183 if (!snapc) {
3184 ret = -ENOMEM;
3185 goto out;
3186 }
3187
3188 atomic_set(&snapc->nref, 1);
3189 snapc->seq = seq;
3190 snapc->num_snaps = snap_count;
3191 for (i = 0; i < snap_count; i++)
3192 snapc->snaps[i] = ceph_decode_64(&p);
3193
3194 rbd_dev->header.snapc = snapc;
3195
3196 dout(" snap context seq = %llu, snap_count = %u\n",
3197 (unsigned long long) seq, (unsigned int) snap_count);
3198
3199out:
3200 kfree(reply_buf);
3201
3202 return 0;
3203}
3204
b8b1e2db
AE
3205static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3206{
3207 size_t size;
3208 void *reply_buf;
3209 __le64 snap_id;
3210 int ret;
3211 void *p;
3212 void *end;
b8b1e2db
AE
3213 char *snap_name;
3214
3215 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3216 reply_buf = kmalloc(size, GFP_KERNEL);
3217 if (!reply_buf)
3218 return ERR_PTR(-ENOMEM);
3219
3220 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
36be9a76 3221 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
b8b1e2db
AE
3222 "rbd", "get_snapshot_name",
3223 (char *) &snap_id, sizeof (snap_id),
07b2391f 3224 reply_buf, size, NULL);
36be9a76 3225 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
b8b1e2db
AE
3226 if (ret < 0)
3227 goto out;
3228
3229 p = reply_buf;
3230 end = (char *) reply_buf + size;
e5c35534 3231 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
b8b1e2db
AE
3232 if (IS_ERR(snap_name)) {
3233 ret = PTR_ERR(snap_name);
3234 goto out;
3235 } else {
3236 dout(" snap_id 0x%016llx snap_name = %s\n",
3237 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3238 }
3239 kfree(reply_buf);
3240
3241 return snap_name;
3242out:
3243 kfree(reply_buf);
3244
3245 return ERR_PTR(ret);
3246}
3247
3248static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3249 u64 *snap_size, u64 *snap_features)
3250{
e0b49868 3251 u64 snap_id;
b8b1e2db
AE
3252 u8 order;
3253 int ret;
3254
3255 snap_id = rbd_dev->header.snapc->snaps[which];
3256 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3257 if (ret)
3258 return ERR_PTR(ret);
3259 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3260 if (ret)
3261 return ERR_PTR(ret);
3262
3263 return rbd_dev_v2_snap_name(rbd_dev, which);
3264}
3265
3266static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3267 u64 *snap_size, u64 *snap_features)
3268{
3269 if (rbd_dev->image_format == 1)
3270 return rbd_dev_v1_snap_info(rbd_dev, which,
3271 snap_size, snap_features);
3272 if (rbd_dev->image_format == 2)
3273 return rbd_dev_v2_snap_info(rbd_dev, which,
3274 snap_size, snap_features);
3275 return ERR_PTR(-EINVAL);
3276}
3277
117973fb
AE
3278static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3279{
3280 int ret;
3281 __u8 obj_order;
3282
3283 down_write(&rbd_dev->header_rwsem);
3284
3285 /* Grab old order first, to see if it changes */
3286
3287 obj_order = rbd_dev->header.obj_order,
3288 ret = rbd_dev_v2_image_size(rbd_dev);
3289 if (ret)
3290 goto out;
3291 if (rbd_dev->header.obj_order != obj_order) {
3292 ret = -EIO;
3293 goto out;
3294 }
3295 rbd_update_mapping_size(rbd_dev);
3296
3297 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3298 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3299 if (ret)
3300 goto out;
3301 ret = rbd_dev_snaps_update(rbd_dev);
3302 dout("rbd_dev_snaps_update returned %d\n", ret);
3303 if (ret)
3304 goto out;
3305 ret = rbd_dev_snaps_register(rbd_dev);
3306 dout("rbd_dev_snaps_register returned %d\n", ret);
3307out:
3308 up_write(&rbd_dev->header_rwsem);
3309
3310 return ret;
3311}
3312
dfc5606d 3313/*
35938150
AE
3314 * Scan the rbd device's current snapshot list and compare it to the
3315 * newly-received snapshot context. Remove any existing snapshots
3316 * not present in the new snapshot context. Add a new snapshot for
3317 * any snaphots in the snapshot context not in the current list.
3318 * And verify there are no changes to snapshots we already know
3319 * about.
3320 *
3321 * Assumes the snapshots in the snapshot context are sorted by
3322 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3323 * are also maintained in that order.)
dfc5606d 3324 */
304f6808 3325static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 3326{
35938150
AE
3327 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3328 const u32 snap_count = snapc->num_snaps;
35938150
AE
3329 struct list_head *head = &rbd_dev->snaps;
3330 struct list_head *links = head->next;
3331 u32 index = 0;
dfc5606d 3332
9fcbb800 3333 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
3334 while (index < snap_count || links != head) {
3335 u64 snap_id;
3336 struct rbd_snap *snap;
cd892126
AE
3337 char *snap_name;
3338 u64 snap_size = 0;
3339 u64 snap_features = 0;
dfc5606d 3340
35938150
AE
3341 snap_id = index < snap_count ? snapc->snaps[index]
3342 : CEPH_NOSNAP;
3343 snap = links != head ? list_entry(links, struct rbd_snap, node)
3344 : NULL;
aafb230e 3345 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 3346
35938150
AE
3347 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3348 struct list_head *next = links->next;
dfc5606d 3349
6d292906
AE
3350 /*
3351 * A previously-existing snapshot is not in
3352 * the new snap context.
3353 *
3354 * If the now missing snapshot is the one the
3355 * image is mapped to, clear its exists flag
3356 * so we can avoid sending any more requests
3357 * to it.
3358 */
0d7dbfce 3359 if (rbd_dev->spec->snap_id == snap->id)
6d292906 3360 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
41f38c2b 3361 rbd_remove_snap_dev(snap);
9fcbb800 3362 dout("%ssnap id %llu has been removed\n",
0d7dbfce
AE
3363 rbd_dev->spec->snap_id == snap->id ?
3364 "mapped " : "",
9fcbb800 3365 (unsigned long long) snap->id);
35938150
AE
3366
3367 /* Done with this list entry; advance */
3368
3369 links = next;
dfc5606d
YS
3370 continue;
3371 }
35938150 3372
b8b1e2db
AE
3373 snap_name = rbd_dev_snap_info(rbd_dev, index,
3374 &snap_size, &snap_features);
cd892126
AE
3375 if (IS_ERR(snap_name))
3376 return PTR_ERR(snap_name);
3377
9fcbb800
AE
3378 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3379 (unsigned long long) snap_id);
35938150
AE
3380 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3381 struct rbd_snap *new_snap;
3382
3383 /* We haven't seen this snapshot before */
3384
c8d18425 3385 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 3386 snap_id, snap_size, snap_features);
9fcbb800
AE
3387 if (IS_ERR(new_snap)) {
3388 int err = PTR_ERR(new_snap);
3389
3390 dout(" failed to add dev, error %d\n", err);
3391
3392 return err;
3393 }
35938150
AE
3394
3395 /* New goes before existing, or at end of list */
3396
9fcbb800 3397 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
3398 if (snap)
3399 list_add_tail(&new_snap->node, &snap->node);
3400 else
523f3258 3401 list_add_tail(&new_snap->node, head);
35938150
AE
3402 } else {
3403 /* Already have this one */
3404
9fcbb800
AE
3405 dout(" already present\n");
3406
cd892126 3407 rbd_assert(snap->size == snap_size);
aafb230e 3408 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 3409 rbd_assert(snap->features == snap_features);
35938150
AE
3410
3411 /* Done with this list entry; advance */
3412
3413 links = links->next;
dfc5606d 3414 }
35938150
AE
3415
3416 /* Advance to the next entry in the snapshot context */
3417
3418 index++;
dfc5606d 3419 }
9fcbb800 3420 dout("%s: done\n", __func__);
dfc5606d
YS
3421
3422 return 0;
3423}
3424
304f6808
AE
3425/*
3426 * Scan the list of snapshots and register the devices for any that
3427 * have not already been registered.
3428 */
3429static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3430{
3431 struct rbd_snap *snap;
3432 int ret = 0;
3433
37206ee5 3434 dout("%s:\n", __func__);
86ff77bb
AE
3435 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3436 return -EIO;
304f6808
AE
3437
3438 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3439 if (!rbd_snap_registered(snap)) {
3440 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3441 if (ret < 0)
3442 break;
3443 }
3444 }
3445 dout("%s: returning %d\n", __func__, ret);
3446
3447 return ret;
3448}
3449
dfc5606d
YS
3450static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3451{
dfc5606d 3452 struct device *dev;
cd789ab9 3453 int ret;
dfc5606d
YS
3454
3455 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 3456
cd789ab9 3457 dev = &rbd_dev->dev;
dfc5606d
YS
3458 dev->bus = &rbd_bus_type;
3459 dev->type = &rbd_device_type;
3460 dev->parent = &rbd_root_dev;
3461 dev->release = rbd_dev_release;
de71a297 3462 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 3463 ret = device_register(dev);
dfc5606d 3464
dfc5606d 3465 mutex_unlock(&ctl_mutex);
cd789ab9 3466
dfc5606d 3467 return ret;
602adf40
YS
3468}
3469
dfc5606d
YS
3470static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3471{
3472 device_unregister(&rbd_dev->dev);
3473}
3474
e2839308 3475static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
1ddbe94e
AE
3476
3477/*
499afd5b
AE
3478 * Get a unique rbd identifier for the given new rbd_dev, and add
3479 * the rbd_dev to the global list. The minimum rbd id is 1.
1ddbe94e 3480 */
e2839308 3481static void rbd_dev_id_get(struct rbd_device *rbd_dev)
b7f23c36 3482{
e2839308 3483 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
499afd5b
AE
3484
3485 spin_lock(&rbd_dev_list_lock);
3486 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3487 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3488 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3489 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3490}
b7f23c36 3491
1ddbe94e 3492/*
499afd5b
AE
3493 * Remove an rbd_dev from the global list, and record that its
3494 * identifier is no longer in use.
1ddbe94e 3495 */
e2839308 3496static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 3497{
d184f6bf 3498 struct list_head *tmp;
de71a297 3499 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
3500 int max_id;
3501
aafb230e 3502 rbd_assert(rbd_id > 0);
499afd5b 3503
e2839308
AE
3504 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3505 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
3506 spin_lock(&rbd_dev_list_lock);
3507 list_del_init(&rbd_dev->node);
d184f6bf
AE
3508
3509 /*
3510 * If the id being "put" is not the current maximum, there
3511 * is nothing special we need to do.
3512 */
e2839308 3513 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
3514 spin_unlock(&rbd_dev_list_lock);
3515 return;
3516 }
3517
3518 /*
3519 * We need to update the current maximum id. Search the
3520 * list to find out what it is. We're more likely to find
3521 * the maximum at the end, so search the list backward.
3522 */
3523 max_id = 0;
3524 list_for_each_prev(tmp, &rbd_dev_list) {
3525 struct rbd_device *rbd_dev;
3526
3527 rbd_dev = list_entry(tmp, struct rbd_device, node);
b213e0b1
AE
3528 if (rbd_dev->dev_id > max_id)
3529 max_id = rbd_dev->dev_id;
d184f6bf 3530 }
499afd5b 3531 spin_unlock(&rbd_dev_list_lock);
b7f23c36 3532
1ddbe94e 3533 /*
e2839308 3534 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
3535 * which case it now accurately reflects the new maximum.
3536 * Be careful not to overwrite the maximum value in that
3537 * case.
1ddbe94e 3538 */
e2839308
AE
3539 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3540 dout(" max dev id has been reset\n");
b7f23c36
AE
3541}
3542
e28fff26
AE
3543/*
3544 * Skips over white space at *buf, and updates *buf to point to the
3545 * first found non-space character (if any). Returns the length of
593a9e7b
AE
3546 * the token (string of non-white space characters) found. Note
3547 * that *buf must be terminated with '\0'.
e28fff26
AE
3548 */
3549static inline size_t next_token(const char **buf)
3550{
3551 /*
3552 * These are the characters that produce nonzero for
3553 * isspace() in the "C" and "POSIX" locales.
3554 */
3555 const char *spaces = " \f\n\r\t\v";
3556
3557 *buf += strspn(*buf, spaces); /* Find start of token */
3558
3559 return strcspn(*buf, spaces); /* Return token length */
3560}
3561
3562/*
3563 * Finds the next token in *buf, and if the provided token buffer is
3564 * big enough, copies the found token into it. The result, if
593a9e7b
AE
3565 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3566 * must be terminated with '\0' on entry.
e28fff26
AE
3567 *
3568 * Returns the length of the token found (not including the '\0').
3569 * Return value will be 0 if no token is found, and it will be >=
3570 * token_size if the token would not fit.
3571 *
593a9e7b 3572 * The *buf pointer will be updated to point beyond the end of the
e28fff26
AE
3573 * found token. Note that this occurs even if the token buffer is
3574 * too small to hold it.
3575 */
3576static inline size_t copy_token(const char **buf,
3577 char *token,
3578 size_t token_size)
3579{
3580 size_t len;
3581
3582 len = next_token(buf);
3583 if (len < token_size) {
3584 memcpy(token, *buf, len);
3585 *(token + len) = '\0';
3586 }
3587 *buf += len;
3588
3589 return len;
3590}
3591
ea3352f4
AE
3592/*
3593 * Finds the next token in *buf, dynamically allocates a buffer big
3594 * enough to hold a copy of it, and copies the token into the new
3595 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3596 * that a duplicate buffer is created even for a zero-length token.
3597 *
3598 * Returns a pointer to the newly-allocated duplicate, or a null
3599 * pointer if memory for the duplicate was not available. If
3600 * the lenp argument is a non-null pointer, the length of the token
3601 * (not including the '\0') is returned in *lenp.
3602 *
3603 * If successful, the *buf pointer will be updated to point beyond
3604 * the end of the found token.
3605 *
3606 * Note: uses GFP_KERNEL for allocation.
3607 */
3608static inline char *dup_token(const char **buf, size_t *lenp)
3609{
3610 char *dup;
3611 size_t len;
3612
3613 len = next_token(buf);
4caf35f9 3614 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
ea3352f4
AE
3615 if (!dup)
3616 return NULL;
ea3352f4
AE
3617 *(dup + len) = '\0';
3618 *buf += len;
3619
3620 if (lenp)
3621 *lenp = len;
3622
3623 return dup;
3624}
3625
a725f65e 3626/*
859c31df
AE
3627 * Parse the options provided for an "rbd add" (i.e., rbd image
3628 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3629 * and the data written is passed here via a NUL-terminated buffer.
3630 * Returns 0 if successful or an error code otherwise.
d22f76e7 3631 *
859c31df
AE
3632 * The information extracted from these options is recorded in
3633 * the other parameters which return dynamically-allocated
3634 * structures:
3635 * ceph_opts
3636 * The address of a pointer that will refer to a ceph options
3637 * structure. Caller must release the returned pointer using
3638 * ceph_destroy_options() when it is no longer needed.
3639 * rbd_opts
3640 * Address of an rbd options pointer. Fully initialized by
3641 * this function; caller must release with kfree().
3642 * spec
3643 * Address of an rbd image specification pointer. Fully
3644 * initialized by this function based on parsed options.
3645 * Caller must release with rbd_spec_put().
3646 *
3647 * The options passed take this form:
3648 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3649 * where:
3650 * <mon_addrs>
3651 * A comma-separated list of one or more monitor addresses.
3652 * A monitor address is an ip address, optionally followed
3653 * by a port number (separated by a colon).
3654 * I.e.: ip1[:port1][,ip2[:port2]...]
3655 * <options>
3656 * A comma-separated list of ceph and/or rbd options.
3657 * <pool_name>
3658 * The name of the rados pool containing the rbd image.
3659 * <image_name>
3660 * The name of the image in that pool to map.
3661 * <snap_id>
3662 * An optional snapshot id. If provided, the mapping will
3663 * present data from the image at the time that snapshot was
3664 * created. The image head is used if no snapshot id is
3665 * provided. Snapshot mappings are always read-only.
a725f65e 3666 */
859c31df 3667static int rbd_add_parse_args(const char *buf,
dc79b113 3668 struct ceph_options **ceph_opts,
859c31df
AE
3669 struct rbd_options **opts,
3670 struct rbd_spec **rbd_spec)
e28fff26 3671{
d22f76e7 3672 size_t len;
859c31df 3673 char *options;
0ddebc0c
AE
3674 const char *mon_addrs;
3675 size_t mon_addrs_size;
859c31df 3676 struct rbd_spec *spec = NULL;
4e9afeba 3677 struct rbd_options *rbd_opts = NULL;
859c31df 3678 struct ceph_options *copts;
dc79b113 3679 int ret;
e28fff26
AE
3680
3681 /* The first four tokens are required */
3682
7ef3214a 3683 len = next_token(&buf);
4fb5d671
AE
3684 if (!len) {
3685 rbd_warn(NULL, "no monitor address(es) provided");
3686 return -EINVAL;
3687 }
0ddebc0c 3688 mon_addrs = buf;
f28e565a 3689 mon_addrs_size = len + 1;
7ef3214a 3690 buf += len;
a725f65e 3691
dc79b113 3692 ret = -EINVAL;
f28e565a
AE
3693 options = dup_token(&buf, NULL);
3694 if (!options)
dc79b113 3695 return -ENOMEM;
4fb5d671
AE
3696 if (!*options) {
3697 rbd_warn(NULL, "no options provided");
3698 goto out_err;
3699 }
e28fff26 3700
859c31df
AE
3701 spec = rbd_spec_alloc();
3702 if (!spec)
f28e565a 3703 goto out_mem;
859c31df
AE
3704
3705 spec->pool_name = dup_token(&buf, NULL);
3706 if (!spec->pool_name)
3707 goto out_mem;
4fb5d671
AE
3708 if (!*spec->pool_name) {
3709 rbd_warn(NULL, "no pool name provided");
3710 goto out_err;
3711 }
e28fff26 3712
69e7a02f 3713 spec->image_name = dup_token(&buf, NULL);
859c31df 3714 if (!spec->image_name)
f28e565a 3715 goto out_mem;
4fb5d671
AE
3716 if (!*spec->image_name) {
3717 rbd_warn(NULL, "no image name provided");
3718 goto out_err;
3719 }
d4b125e9 3720
f28e565a
AE
3721 /*
3722 * Snapshot name is optional; default is to use "-"
3723 * (indicating the head/no snapshot).
3724 */
3feeb894 3725 len = next_token(&buf);
820a5f3e 3726 if (!len) {
3feeb894
AE
3727 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3728 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
f28e565a 3729 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
dc79b113 3730 ret = -ENAMETOOLONG;
f28e565a 3731 goto out_err;
849b4260 3732 }
4caf35f9 3733 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
859c31df 3734 if (!spec->snap_name)
f28e565a 3735 goto out_mem;
859c31df 3736 *(spec->snap_name + len) = '\0';
e5c35534 3737
0ddebc0c 3738 /* Initialize all rbd options to the defaults */
e28fff26 3739
4e9afeba
AE
3740 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3741 if (!rbd_opts)
3742 goto out_mem;
3743
3744 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
d22f76e7 3745
859c31df 3746 copts = ceph_parse_options(options, mon_addrs,
0ddebc0c 3747 mon_addrs + mon_addrs_size - 1,
4e9afeba 3748 parse_rbd_opts_token, rbd_opts);
859c31df
AE
3749 if (IS_ERR(copts)) {
3750 ret = PTR_ERR(copts);
dc79b113
AE
3751 goto out_err;
3752 }
859c31df
AE
3753 kfree(options);
3754
3755 *ceph_opts = copts;
4e9afeba 3756 *opts = rbd_opts;
859c31df 3757 *rbd_spec = spec;
0ddebc0c 3758
dc79b113 3759 return 0;
f28e565a 3760out_mem:
dc79b113 3761 ret = -ENOMEM;
d22f76e7 3762out_err:
859c31df
AE
3763 kfree(rbd_opts);
3764 rbd_spec_put(spec);
f28e565a 3765 kfree(options);
d22f76e7 3766
dc79b113 3767 return ret;
a725f65e
AE
3768}
3769
589d30e0
AE
3770/*
3771 * An rbd format 2 image has a unique identifier, distinct from the
3772 * name given to it by the user. Internally, that identifier is
3773 * what's used to specify the names of objects related to the image.
3774 *
3775 * A special "rbd id" object is used to map an rbd image name to its
3776 * id. If that object doesn't exist, then there is no v2 rbd image
3777 * with the supplied name.
3778 *
3779 * This function will record the given rbd_dev's image_id field if
3780 * it can be determined, and in that case will return 0. If any
3781 * errors occur a negative errno will be returned and the rbd_dev's
3782 * image_id field will be unchanged (and should be NULL).
3783 */
3784static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3785{
3786 int ret;
3787 size_t size;
3788 char *object_name;
3789 void *response;
3790 void *p;
3791
2c0d0a10
AE
3792 /*
3793 * When probing a parent image, the image id is already
3794 * known (and the image name likely is not). There's no
3795 * need to fetch the image id again in this case.
3796 */
3797 if (rbd_dev->spec->image_id)
3798 return 0;
3799
589d30e0
AE
3800 /*
3801 * First, see if the format 2 image id file exists, and if
3802 * so, get the image's persistent id from it.
3803 */
69e7a02f 3804 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
589d30e0
AE
3805 object_name = kmalloc(size, GFP_NOIO);
3806 if (!object_name)
3807 return -ENOMEM;
0d7dbfce 3808 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
589d30e0
AE
3809 dout("rbd id object name is %s\n", object_name);
3810
3811 /* Response will be an encoded string, which includes a length */
3812
3813 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3814 response = kzalloc(size, GFP_NOIO);
3815 if (!response) {
3816 ret = -ENOMEM;
3817 goto out;
3818 }
3819
36be9a76 3820 ret = rbd_obj_method_sync(rbd_dev, object_name,
589d30e0
AE
3821 "rbd", "get_id",
3822 NULL, 0,
07b2391f 3823 response, RBD_IMAGE_ID_LEN_MAX, NULL);
36be9a76 3824 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
589d30e0
AE
3825 if (ret < 0)
3826 goto out;
3827
3828 p = response;
0d7dbfce 3829 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
589d30e0 3830 p + RBD_IMAGE_ID_LEN_MAX,
979ed480 3831 NULL, GFP_NOIO);
0d7dbfce
AE
3832 if (IS_ERR(rbd_dev->spec->image_id)) {
3833 ret = PTR_ERR(rbd_dev->spec->image_id);
3834 rbd_dev->spec->image_id = NULL;
589d30e0 3835 } else {
0d7dbfce 3836 dout("image_id is %s\n", rbd_dev->spec->image_id);
589d30e0
AE
3837 }
3838out:
3839 kfree(response);
3840 kfree(object_name);
3841
3842 return ret;
3843}
3844
a30b71b9
AE
3845static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3846{
3847 int ret;
3848 size_t size;
3849
3850 /* Version 1 images have no id; empty string is used */
3851
0d7dbfce
AE
3852 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3853 if (!rbd_dev->spec->image_id)
a30b71b9 3854 return -ENOMEM;
a30b71b9
AE
3855
3856 /* Record the header object name for this rbd image. */
3857
69e7a02f 3858 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
a30b71b9
AE
3859 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3860 if (!rbd_dev->header_name) {
3861 ret = -ENOMEM;
3862 goto out_err;
3863 }
0d7dbfce
AE
3864 sprintf(rbd_dev->header_name, "%s%s",
3865 rbd_dev->spec->image_name, RBD_SUFFIX);
a30b71b9
AE
3866
3867 /* Populate rbd image metadata */
3868
3869 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3870 if (ret < 0)
3871 goto out_err;
86b00e0d
AE
3872
3873 /* Version 1 images have no parent (no layering) */
3874
3875 rbd_dev->parent_spec = NULL;
3876 rbd_dev->parent_overlap = 0;
3877
a30b71b9
AE
3878 rbd_dev->image_format = 1;
3879
3880 dout("discovered version 1 image, header name is %s\n",
3881 rbd_dev->header_name);
3882
3883 return 0;
3884
3885out_err:
3886 kfree(rbd_dev->header_name);
3887 rbd_dev->header_name = NULL;
0d7dbfce
AE
3888 kfree(rbd_dev->spec->image_id);
3889 rbd_dev->spec->image_id = NULL;
a30b71b9
AE
3890
3891 return ret;
3892}
3893
3894static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3895{
3896 size_t size;
9d475de5 3897 int ret;
6e14b1a6 3898 u64 ver = 0;
a30b71b9
AE
3899
3900 /*
3901 * Image id was filled in by the caller. Record the header
3902 * object name for this rbd image.
3903 */
979ed480 3904 size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
a30b71b9
AE
3905 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3906 if (!rbd_dev->header_name)
3907 return -ENOMEM;
3908 sprintf(rbd_dev->header_name, "%s%s",
0d7dbfce 3909 RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
9d475de5
AE
3910
3911 /* Get the size and object order for the image */
3912
3913 ret = rbd_dev_v2_image_size(rbd_dev);
1e130199
AE
3914 if (ret < 0)
3915 goto out_err;
3916
3917 /* Get the object prefix (a.k.a. block_name) for the image */
3918
3919 ret = rbd_dev_v2_object_prefix(rbd_dev);
b1b5402a
AE
3920 if (ret < 0)
3921 goto out_err;
3922
d889140c 3923 /* Get the and check features for the image */
b1b5402a
AE
3924
3925 ret = rbd_dev_v2_features(rbd_dev);
9d475de5
AE
3926 if (ret < 0)
3927 goto out_err;
35d489f9 3928
86b00e0d
AE
3929 /* If the image supports layering, get the parent info */
3930
3931 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3932 ret = rbd_dev_v2_parent_info(rbd_dev);
3933 if (ret < 0)
3934 goto out_err;
3935 }
3936
6e14b1a6
AE
3937 /* crypto and compression type aren't (yet) supported for v2 images */
3938
3939 rbd_dev->header.crypt_type = 0;
3940 rbd_dev->header.comp_type = 0;
35d489f9 3941
6e14b1a6
AE
3942 /* Get the snapshot context, plus the header version */
3943
3944 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
35d489f9
AE
3945 if (ret)
3946 goto out_err;
6e14b1a6
AE
3947 rbd_dev->header.obj_version = ver;
3948
a30b71b9
AE
3949 rbd_dev->image_format = 2;
3950
3951 dout("discovered version 2 image, header name is %s\n",
3952 rbd_dev->header_name);
3953
35152979 3954 return 0;
9d475de5 3955out_err:
86b00e0d
AE
3956 rbd_dev->parent_overlap = 0;
3957 rbd_spec_put(rbd_dev->parent_spec);
3958 rbd_dev->parent_spec = NULL;
9d475de5
AE
3959 kfree(rbd_dev->header_name);
3960 rbd_dev->header_name = NULL;
1e130199
AE
3961 kfree(rbd_dev->header.object_prefix);
3962 rbd_dev->header.object_prefix = NULL;
9d475de5
AE
3963
3964 return ret;
a30b71b9
AE
3965}
3966
83a06263
AE
3967static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3968{
3969 int ret;
3970
3971 /* no need to lock here, as rbd_dev is not registered yet */
3972 ret = rbd_dev_snaps_update(rbd_dev);
3973 if (ret)
3974 return ret;
3975
9e15b77d
AE
3976 ret = rbd_dev_probe_update_spec(rbd_dev);
3977 if (ret)
3978 goto err_out_snaps;
3979
83a06263
AE
3980 ret = rbd_dev_set_mapping(rbd_dev);
3981 if (ret)
3982 goto err_out_snaps;
3983
3984 /* generate unique id: find highest unique id, add one */
3985 rbd_dev_id_get(rbd_dev);
3986
3987 /* Fill in the device name, now that we have its id. */
3988 BUILD_BUG_ON(DEV_NAME_LEN
3989 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3990 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3991
3992 /* Get our block major device number. */
3993
3994 ret = register_blkdev(0, rbd_dev->name);
3995 if (ret < 0)
3996 goto err_out_id;
3997 rbd_dev->major = ret;
3998
3999 /* Set up the blkdev mapping. */
4000
4001 ret = rbd_init_disk(rbd_dev);
4002 if (ret)
4003 goto err_out_blkdev;
4004
4005 ret = rbd_bus_add_dev(rbd_dev);
4006 if (ret)
4007 goto err_out_disk;
4008
4009 /*
4010 * At this point cleanup in the event of an error is the job
4011 * of the sysfs code (initiated by rbd_bus_del_dev()).
4012 */
4013 down_write(&rbd_dev->header_rwsem);
4014 ret = rbd_dev_snaps_register(rbd_dev);
4015 up_write(&rbd_dev->header_rwsem);
4016 if (ret)
4017 goto err_out_bus;
4018
9969ebc5 4019 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
83a06263
AE
4020 if (ret)
4021 goto err_out_bus;
4022
4023 /* Everything's ready. Announce the disk to the world. */
4024
4025 add_disk(rbd_dev->disk);
4026
4027 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4028 (unsigned long long) rbd_dev->mapping.size);
4029
4030 return ret;
4031err_out_bus:
4032 /* this will also clean up rest of rbd_dev stuff */
4033
4034 rbd_bus_del_dev(rbd_dev);
4035
4036 return ret;
4037err_out_disk:
4038 rbd_free_disk(rbd_dev);
4039err_out_blkdev:
4040 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4041err_out_id:
4042 rbd_dev_id_put(rbd_dev);
4043err_out_snaps:
4044 rbd_remove_all_snaps(rbd_dev);
4045
4046 return ret;
4047}
4048
a30b71b9
AE
4049/*
4050 * Probe for the existence of the header object for the given rbd
4051 * device. For format 2 images this includes determining the image
4052 * id.
4053 */
4054static int rbd_dev_probe(struct rbd_device *rbd_dev)
4055{
4056 int ret;
4057
4058 /*
4059 * Get the id from the image id object. If it's not a
4060 * format 2 image, we'll get ENOENT back, and we'll assume
4061 * it's a format 1 image.
4062 */
4063 ret = rbd_dev_image_id(rbd_dev);
4064 if (ret)
4065 ret = rbd_dev_v1_probe(rbd_dev);
4066 else
4067 ret = rbd_dev_v2_probe(rbd_dev);
83a06263 4068 if (ret) {
a30b71b9
AE
4069 dout("probe failed, returning %d\n", ret);
4070
83a06263
AE
4071 return ret;
4072 }
4073
4074 ret = rbd_dev_probe_finish(rbd_dev);
4075 if (ret)
4076 rbd_header_free(&rbd_dev->header);
4077
a30b71b9
AE
4078 return ret;
4079}
4080
59c2be1e
YS
4081static ssize_t rbd_add(struct bus_type *bus,
4082 const char *buf,
4083 size_t count)
602adf40 4084{
cb8627c7 4085 struct rbd_device *rbd_dev = NULL;
dc79b113 4086 struct ceph_options *ceph_opts = NULL;
4e9afeba 4087 struct rbd_options *rbd_opts = NULL;
859c31df 4088 struct rbd_spec *spec = NULL;
9d3997fd 4089 struct rbd_client *rbdc;
27cc2594
AE
4090 struct ceph_osd_client *osdc;
4091 int rc = -ENOMEM;
602adf40
YS
4092
4093 if (!try_module_get(THIS_MODULE))
4094 return -ENODEV;
4095
602adf40 4096 /* parse add command */
859c31df 4097 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
dc79b113 4098 if (rc < 0)
bd4ba655 4099 goto err_out_module;
78cea76e 4100
9d3997fd
AE
4101 rbdc = rbd_get_client(ceph_opts);
4102 if (IS_ERR(rbdc)) {
4103 rc = PTR_ERR(rbdc);
0ddebc0c 4104 goto err_out_args;
9d3997fd 4105 }
c53d5893 4106 ceph_opts = NULL; /* rbd_dev client now owns this */
602adf40 4107
602adf40 4108 /* pick the pool */
9d3997fd 4109 osdc = &rbdc->client->osdc;
859c31df 4110 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
602adf40
YS
4111 if (rc < 0)
4112 goto err_out_client;
859c31df
AE
4113 spec->pool_id = (u64) rc;
4114
0903e875
AE
4115 /* The ceph file layout needs to fit pool id in 32 bits */
4116
4117 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
4118 rc = -EIO;
4119 goto err_out_client;
4120 }
4121
c53d5893 4122 rbd_dev = rbd_dev_create(rbdc, spec);
bd4ba655
AE
4123 if (!rbd_dev)
4124 goto err_out_client;
c53d5893
AE
4125 rbdc = NULL; /* rbd_dev now owns this */
4126 spec = NULL; /* rbd_dev now owns this */
602adf40 4127
bd4ba655 4128 rbd_dev->mapping.read_only = rbd_opts->read_only;
c53d5893
AE
4129 kfree(rbd_opts);
4130 rbd_opts = NULL; /* done with this */
bd4ba655 4131
a30b71b9
AE
4132 rc = rbd_dev_probe(rbd_dev);
4133 if (rc < 0)
c53d5893 4134 goto err_out_rbd_dev;
05fd6f6f 4135
602adf40 4136 return count;
c53d5893
AE
4137err_out_rbd_dev:
4138 rbd_dev_destroy(rbd_dev);
bd4ba655 4139err_out_client:
9d3997fd 4140 rbd_put_client(rbdc);
0ddebc0c 4141err_out_args:
78cea76e
AE
4142 if (ceph_opts)
4143 ceph_destroy_options(ceph_opts);
4e9afeba 4144 kfree(rbd_opts);
859c31df 4145 rbd_spec_put(spec);
bd4ba655
AE
4146err_out_module:
4147 module_put(THIS_MODULE);
27cc2594 4148
602adf40 4149 dout("Error adding device %s\n", buf);
27cc2594
AE
4150
4151 return (ssize_t) rc;
602adf40
YS
4152}
4153
de71a297 4154static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
4155{
4156 struct list_head *tmp;
4157 struct rbd_device *rbd_dev;
4158
e124a82f 4159 spin_lock(&rbd_dev_list_lock);
602adf40
YS
4160 list_for_each(tmp, &rbd_dev_list) {
4161 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 4162 if (rbd_dev->dev_id == dev_id) {
e124a82f 4163 spin_unlock(&rbd_dev_list_lock);
602adf40 4164 return rbd_dev;
e124a82f 4165 }
602adf40 4166 }
e124a82f 4167 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
4168 return NULL;
4169}
4170
dfc5606d 4171static void rbd_dev_release(struct device *dev)
602adf40 4172{
593a9e7b 4173 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 4174
59c2be1e 4175 if (rbd_dev->watch_event)
9969ebc5 4176 rbd_dev_header_watch_sync(rbd_dev, 0);
602adf40
YS
4177
4178 /* clean up and free blkdev */
4179 rbd_free_disk(rbd_dev);
4180 unregister_blkdev(rbd_dev->major, rbd_dev->name);
32eec68d 4181
2ac4e75d
AE
4182 /* release allocated disk header fields */
4183 rbd_header_free(&rbd_dev->header);
4184
32eec68d 4185 /* done with the id, and with the rbd_dev */
e2839308 4186 rbd_dev_id_put(rbd_dev);
c53d5893
AE
4187 rbd_assert(rbd_dev->rbd_client != NULL);
4188 rbd_dev_destroy(rbd_dev);
602adf40
YS
4189
4190 /* release module ref */
4191 module_put(THIS_MODULE);
602adf40
YS
4192}
4193
dfc5606d
YS
4194static ssize_t rbd_remove(struct bus_type *bus,
4195 const char *buf,
4196 size_t count)
602adf40
YS
4197{
4198 struct rbd_device *rbd_dev = NULL;
4199 int target_id, rc;
4200 unsigned long ul;
4201 int ret = count;
4202
4203 rc = strict_strtoul(buf, 10, &ul);
4204 if (rc)
4205 return rc;
4206
4207 /* convert to int; abort if we lost anything in the conversion */
4208 target_id = (int) ul;
4209 if (target_id != ul)
4210 return -EINVAL;
4211
4212 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4213
4214 rbd_dev = __rbd_get_dev(target_id);
4215 if (!rbd_dev) {
4216 ret = -ENOENT;
4217 goto done;
42382b70
AE
4218 }
4219
a14ea269 4220 spin_lock_irq(&rbd_dev->lock);
b82d167b 4221 if (rbd_dev->open_count)
42382b70 4222 ret = -EBUSY;
b82d167b
AE
4223 else
4224 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
a14ea269 4225 spin_unlock_irq(&rbd_dev->lock);
b82d167b 4226 if (ret < 0)
42382b70 4227 goto done;
602adf40 4228
41f38c2b 4229 rbd_remove_all_snaps(rbd_dev);
dfc5606d 4230 rbd_bus_del_dev(rbd_dev);
602adf40
YS
4231
4232done:
4233 mutex_unlock(&ctl_mutex);
aafb230e 4234
602adf40
YS
4235 return ret;
4236}
4237
602adf40
YS
4238/*
4239 * create control files in sysfs
dfc5606d 4240 * /sys/bus/rbd/...
602adf40
YS
4241 */
4242static int rbd_sysfs_init(void)
4243{
dfc5606d 4244 int ret;
602adf40 4245
fed4c143 4246 ret = device_register(&rbd_root_dev);
21079786 4247 if (ret < 0)
dfc5606d 4248 return ret;
602adf40 4249
fed4c143
AE
4250 ret = bus_register(&rbd_bus_type);
4251 if (ret < 0)
4252 device_unregister(&rbd_root_dev);
602adf40 4253
602adf40
YS
4254 return ret;
4255}
4256
4257static void rbd_sysfs_cleanup(void)
4258{
dfc5606d 4259 bus_unregister(&rbd_bus_type);
fed4c143 4260 device_unregister(&rbd_root_dev);
602adf40
YS
4261}
4262
cc344fa1 4263static int __init rbd_init(void)
602adf40
YS
4264{
4265 int rc;
4266
1e32d34c
AE
4267 if (!libceph_compatible(NULL)) {
4268 rbd_warn(NULL, "libceph incompatibility (quitting)");
4269
4270 return -EINVAL;
4271 }
602adf40
YS
4272 rc = rbd_sysfs_init();
4273 if (rc)
4274 return rc;
f0f8cef5 4275 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
4276 return 0;
4277}
4278
cc344fa1 4279static void __exit rbd_exit(void)
602adf40
YS
4280{
4281 rbd_sysfs_cleanup();
4282}
4283
4284module_init(rbd_init);
4285module_exit(rbd_exit);
4286
4287MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
4288MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
4289MODULE_DESCRIPTION("rados block device");
4290
4291/* following authorship retained from original osdblk.c */
4292MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
4293
4294MODULE_LICENSE("GPL");