/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256	/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
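/*
 * (Illustrative arithmetic for the limit above: 510 snapshot ids at
 * sizeof (u64) = 8 bytes each is 4080 bytes, leaving just enough
 * headroom below 4KB for the snapshot context header.)
 */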

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
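/*
 * (The width expression over-approximates the decimal digits in an
 * int: each byte contributes at most 5/2 digits, since 2^8 < 10^2.5,
 * and the +1 adds slack for a sign.)
 */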

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 obj_version;
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	char		*pool_name;

	char		*image_id;
	char		*image_name;

	u64		snap_id;
	char		*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
};

struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	struct rbd_img_request	*img_request;
	u64			img_offset;	/* image relative offset */
	struct list_head	links;		/* img_request->obj_requests */
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)

struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};

struct rbd_mapping {
	u64			size;
	u64			features;
	bool			read_only;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event	*watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};

static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
# define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);
	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false

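/*
 * For example, mapping an image with the options string "ro" (or
 * "read_only") causes parse_rbd_opts_token() below to set
 * rbd_opts->read_only to true; "rw"/"read_write" sets it back to
 * false.
 */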
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}
	return 0;
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}

/*
 * Destroy ceph client.  Takes rbd_client_list_lock itself to remove
 * the client from the client list.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX)
			return -EIO;
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees
		 * the ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		WARN_ON(ondisk->snap_names_len);
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->features = 0;	/* No features support in v1 images */
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	header->image_size = le64_to_cpu(ondisk->image_size);
	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] =
			le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}

static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{
	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->spec->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}

static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
{
	int ret;

	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->spec->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (ret < 0)
			goto done;
		rbd_dev->mapping.read_only = true;
	}
	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);

done:
	return ret;
}

static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}

static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}

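/*
 * Illustration (hypothetical values): with object prefix
 * "rb.0.1234.5678" and 4MB (order 22) objects, image offset
 * 0x1234567 lands in segment 4, so rbd_segment_name() above yields
 * "rb.0.1234.5678.000000000004" via its "%s.%012llx" format.
 */
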
static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}

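/*
 * E.g. with 4MB segments, a 6MB request starting 1MB into a segment
 * is clipped by rbd_segment_length() to the 3MB remaining in that
 * segment; callers loop (see rbd_img_request_fill_bio()) to cover
 * the rest.
 */
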
/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_img_request *img_request = obj_request->img_request;
		struct rbd_device *rbd_dev;

		rbd_dev = img_request ? img_request->rbd_dev : NULL;
		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

1134static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1135 struct rbd_obj_request *obj_request)
1136{
25dcf954
AE
1137 rbd_assert(obj_request->img_request == NULL);
1138
bf0d5f50
AE
1139 rbd_obj_request_get(obj_request);
1140 obj_request->img_request = img_request;
25dcf954 1141 obj_request->which = img_request->obj_request_count;
bf0d5f50 1142 rbd_assert(obj_request->which != BAD_WHICH);
25dcf954
AE
1143 img_request->obj_request_count++;
1144 list_add_tail(&obj_request->links, &img_request->obj_requests);
37206ee5
AE
1145 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1146 obj_request->which);
bf0d5f50
AE
1147}
1148
1149static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1150 struct rbd_obj_request *obj_request)
1151{
1152 rbd_assert(obj_request->which != BAD_WHICH);
25dcf954 1153
37206ee5
AE
1154 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1155 obj_request->which);
bf0d5f50 1156 list_del(&obj_request->links);
25dcf954
AE
1157 rbd_assert(img_request->obj_request_count > 0);
1158 img_request->obj_request_count--;
1159 rbd_assert(obj_request->which == img_request->obj_request_count);
1160 obj_request->which = BAD_WHICH;
bf0d5f50 1161 rbd_assert(obj_request->img_request == img_request);
bf0d5f50 1162 obj_request->img_request = NULL;
25dcf954 1163 obj_request->callback = NULL;
bf0d5f50
AE
1164 rbd_obj_request_put(obj_request);
1165}
1166
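/*
 * Because rbd_img_obj_request_del() asserts that the request being
 * removed is the one with the highest "which" value, object requests
 * must be torn down in reverse order -- hence the
 * list_for_each_entry_safe_reverse() in for_each_obj_request_safe().
 */
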
static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}

static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	dout("%s: img %p\n", __func__, img_request);

	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request.  We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; not clear which way is better off hand.
	 */
	if (!img_request->result) {
		struct rbd_obj_request *obj_request;
		u64 xferred = 0;

		for_each_obj_request(img_request, obj_request)
			xferred += obj_request->xferred;
		img_request->xferred = xferred;
	}

	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}

/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}

static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}

static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}

static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the
	 * entire length of the request.  A short read also implies
	 * zero-fill to the end of the request.  Either way we
	 * update the xferred count to indicate the whole request
	 * was satisfied.
	 */
	BUG_ON(obj_request->type != OBJ_REQUEST_BIO);
	if (obj_request->result == -ENOENT) {
		zero_bio_chain(obj_request->bio_list, 0);
		obj_request->result = 0;
		obj_request->xferred = obj_request->length;
	} else if (obj_request->xferred < obj_request->length &&
			!obj_request->result) {
		zero_bio_chain(obj_request->bio_list, obj_request->xferred);
		obj_request->xferred = obj_request->length;
	}
	obj_request_done_set(obj_request);
}

static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}

static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu/%llu\n", __func__, obj_request,
		obj_request->result, obj_request->xferred, obj_request->length);
	if (obj_request->img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}

static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.
	 * Our xferred value is the number of bytes transferred
	 * back.  Set it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}

/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	WARN_ON(osd_req->r_num_ops != 1);	/* For now */

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64) UINT_MAX);
	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}

static void rbd_osd_req_format(struct rbd_obj_request *obj_request,
					bool write_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	struct ceph_snap_context *snapc = NULL;
	u64 snap_id = CEPH_NOSNAP;
	struct timespec *mtime = NULL;
	struct timespec now;

	rbd_assert(osd_req != NULL);

	if (write_request) {
		now = CURRENT_TIME;
		mtime = &now;
		if (img_request)
			snapc = img_request->snapc;
	} else if (img_request) {
		snap_id = img_request->snap_id;
	}
	ceph_osdc_build_request(osd_req, obj_request->offset,
			snapc, snap_id, mtime);
}

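/*
 * In short: rbd_osd_req_format() stamps writes with an mtime and the
 * image's snapshot context, while reads carry only the id of the
 * snapshot (or head) they should be served from.
 */
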
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	if (img_request) {
		rbd_assert(write_request ==
				img_request_write_test(img_request));
		if (write_request)
			snapc = img_request->snapc;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	if (write_request)
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	else
		osd_req->r_flags = CEPH_OSD_FLAG_READ;

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	return osd_req;
}

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}

/* object_name is assumed to be a non-null pointer and NUL-terminated */

static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->flags = 0;
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
		offset, length, (int)type, obj_request);

	return obj_request;
}

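/*
 * Note the allocation trick in rbd_obj_request_create(): the object
 * name is copied into the same kzalloc() block, directly after the
 * request structure, so the single kfree() in
 * rbd_obj_request_destroy() below releases both.
 */
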
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request,
					bool child_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
		if (WARN_ON(!snapc)) {
			kfree(img_request);
			return NULL;	/* Shouldn't happen */
		}
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->flags = 0;
	if (write_request) {
		img_request_write_set(img_request);
		img_request->snapc = snapc;
	} else {
		img_request->snap_id = rbd_dev->spec->snap_id;
	}
	if (child_request)
		img_request_child_set(img_request);
	if (rbd_dev->parent_spec)
		img_request_layered_set(img_request);
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->result = 0;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	(void) img_request_layered_test(img_request);	/* Avoid a warning */
	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		write_request ? "write" : "read", offset, length,
		img_request);

	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request_write_test(img_request))
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}

static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	unsigned int xferred;
	int result;

	rbd_assert(!img_request_child_test(img_request));
	rbd_assert(img_request->rq != NULL);

	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
	xferred = (unsigned int)obj_request->xferred;
	result = obj_request->result;
	if (result) {
		struct rbd_device *rbd_dev = img_request->rbd_dev;

		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
			img_request_write_test(img_request) ? "write" : "read",
			obj_request->length, obj_request->img_offset,
			obj_request->offset);
		rbd_warn(rbd_dev, "  result %d xferred %x\n",
			result, xferred);
		if (!img_request->result)
			img_request->result = result;
	}

	return blk_end_request(img_request->rq, result, xferred);
}

static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;
		more = rbd_img_obj_end_request(obj_request);
		which++;
	}

	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}

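/*
 * The next_completion bookkeeping above means object requests are
 * handed to rbd_img_obj_end_request() strictly in image order: a
 * request that completes out of turn only marks itself done, and the
 * one that was next in line later sweeps forward over all of its
 * already-done successors.
 */
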
static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
					struct bio *bio_list)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	bool write_request = img_request_write_test(img_request);
	unsigned int bio_offset;
	u64 img_offset;
	u64 resid;
	u16 opcode;

	dout("%s: img %p bio %p\n", __func__, img_request, bio_list);

	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
	bio_offset = 0;
	img_offset = img_request->offset;
	rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
	resid = img_request->length;
	rbd_assert(resid > 0);
	while (resid) {
		struct ceph_osd_request *osd_req;
		const char *object_name;
		unsigned int clone_size;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, img_offset);
		if (!object_name)
			goto out_unwind;
		offset = rbd_segment_offset(rbd_dev, img_offset);
		length = rbd_segment_length(rbd_dev, img_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length,
						OBJ_REQUEST_BIO);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		rbd_assert(length <= (u64) UINT_MAX);
		clone_size = (unsigned int) length;
		obj_request->bio_list = bio_chain_clone_range(&bio_list,
						&bio_offset, clone_size,
						GFP_ATOMIC);
		if (!obj_request->bio_list)
			goto out_partial;

		osd_req = rbd_osd_req_create(rbd_dev, write_request,
						obj_request);
		if (!osd_req)
			goto out_partial;
		obj_request->osd_req = osd_req;
		obj_request->callback = rbd_img_obj_callback;

		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
						0, 0);
		osd_req_op_extent_osd_data_bio(osd_req, 0, write_request,
				obj_request->bio_list, obj_request->length);
		rbd_osd_req_format(obj_request, write_request);

		obj_request->img_offset = img_offset;
		rbd_img_obj_request_add(img_request, obj_request);

		img_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}

1750static int rbd_img_request_submit(struct rbd_img_request *img_request)
1751{
1752 struct rbd_device *rbd_dev = img_request->rbd_dev;
1753 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1754 struct rbd_obj_request *obj_request;
1755 struct rbd_obj_request *next_obj_request;
1756
1757 dout("%s: img %p\n", __func__, img_request);
1758 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
1759 int ret;
1760
1761 ret = rbd_obj_request_submit(osdc, obj_request);
1762 if (ret)
1763 return ret;
1764 /*
1765 * The image request has its own reference to each
1766 * of its object requests, so we can safely drop the
1767 * initial one here.
1768 */
1769 rbd_obj_request_put(obj_request);
1770 }
1771
1772 return 0;
1773}
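/*
 * Illustrative sketch, not part of the driver: the submit loop above
 * depends on a reference handoff.  rbd_img_obj_request_add() gave the
 * image request its own reference to each object request, so the
 * creator's initial reference can be dropped once submission is done.
 * The same pattern in plain, standalone C (an int standing in for the
 * kernel's kref):
 */
#include <stdlib.h>

struct ref_obj {
	int refcount;			/* holds 1 on creation */
};

static struct ref_obj *ref_obj_create(void)
{
	struct ref_obj *obj = calloc(1, sizeof(*obj));

	if (obj)
		obj->refcount = 1;	/* the creator's reference */
	return obj;
}

static void ref_obj_get(struct ref_obj *obj)
{
	obj->refcount++;		/* a container takes its own reference */
}

static void ref_obj_put(struct ref_obj *obj)
{
	if (--obj->refcount == 0)	/* last reference frees the object */
		free(obj);
}

/* Once a container has called ref_obj_get(), the creator may safely
 * ref_obj_put() its initial reference; the object lives on until the
 * container releases it, just as the comment above describes. */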
1774
1775 static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
1776 u64 ver, u64 notify_id)
1777{
1778 struct rbd_obj_request *obj_request;
1779 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1780 int ret;
1781
1782 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1783 OBJ_REQUEST_NODATA);
1784 if (!obj_request)
1785 return -ENOMEM;
1786
1787 ret = -ENOMEM;
1788 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1789 if (!obj_request->osd_req)
1790 goto out;
1791 obj_request->callback = rbd_obj_request_put;
1792
1793 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
1794 notify_id, ver, 0);
1795 rbd_osd_req_format(obj_request, false);
1796
1797 ret = rbd_obj_request_submit(osdc, obj_request);
1798out:
1799 if (ret)
1800 rbd_obj_request_put(obj_request);
1801
1802 return ret;
1803}
1804
1805static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1806{
1807 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1808 u64 hver;
1809 int rc;
1810
1811 if (!rbd_dev)
1812 return;
1813
1814 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
1815 rbd_dev->header_name, (unsigned long long) notify_id,
1816 (unsigned int) opcode);
1817 rc = rbd_dev_refresh(rbd_dev, &hver);
1818 if (rc)
1819 rbd_warn(rbd_dev, "got notification but failed to "
1820 "update snaps: %d\n", rc);
1821
1822 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
1823}
1824
1825/*
1826 * Request sync osd watch/unwatch. The value of "start" determines
1827 * whether a watch request is being initiated or torn down.
1828 */
1829static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
1830{
1831 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1832 struct rbd_obj_request *obj_request;
1833 int ret;
1834
1835 rbd_assert(start ^ !!rbd_dev->watch_event);
1836 rbd_assert(start ^ !!rbd_dev->watch_request);
1837
1838 if (start) {
1839 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
1840 &rbd_dev->watch_event);
1841 if (ret < 0)
1842 return ret;
1843 rbd_assert(rbd_dev->watch_event != NULL);
1844 }
1845
1846 ret = -ENOMEM;
1847 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1848 OBJ_REQUEST_NODATA);
1849 if (!obj_request)
1850 goto out_cancel;
1851
1852 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
1853 if (!obj_request->osd_req)
1854 goto out_cancel;
1855
1856 if (start)
1857 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
1858 else
1859 ceph_osdc_unregister_linger_request(osdc,
1860 rbd_dev->watch_request->osd_req);
1861
1862 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
1863 rbd_dev->watch_event->cookie,
1864 rbd_dev->header.obj_version, start);
1865 rbd_osd_req_format(obj_request, true);
1866
1867 ret = rbd_obj_request_submit(osdc, obj_request);
1868 if (ret)
1869 goto out_cancel;
1870 ret = rbd_obj_request_wait(obj_request);
1871 if (ret)
1872 goto out_cancel;
1873 ret = obj_request->result;
1874 if (ret)
1875 goto out_cancel;
1876
1877 /*
1878 * A watch request is set to linger, so the underlying osd
1879 * request won't go away until we unregister it. We retain
1880 * a pointer to the object request during that time (in
1881 * rbd_dev->watch_request), so we'll keep a reference to
1882 * it. We'll drop that reference (below) after we've
1883 * unregistered it.
1884 */
1885 if (start) {
1886 rbd_dev->watch_request = obj_request;
1887
1888 return 0;
1889 }
1890
1891 /* We have successfully torn down the watch request */
1892
1893 rbd_obj_request_put(rbd_dev->watch_request);
1894 rbd_dev->watch_request = NULL;
1895out_cancel:
1896 /* Cancel the event if we're tearing down, or on error */
1897 ceph_osdc_cancel_event(rbd_dev->watch_event);
1898 rbd_dev->watch_event = NULL;
1899 if (obj_request)
1900 rbd_obj_request_put(obj_request);
1901
1902 return ret;
1903}
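/*
 * Illustrative sketch, not part of the driver: the watch setup and
 * teardown above share one function whose preconditions mirror the two
 * rbd_assert() calls -- the event must not exist when starting and must
 * exist when stopping.  A standalone toggle with the same invariant:
 */
#include <assert.h>
#include <stdlib.h>

struct watch_state {
	void *event;			/* NULL unless a watch is active */
};

static int watch_toggle(struct watch_state *ws, int start)
{
	assert(!!start ^ !!ws->event);	/* same xor check as the driver */

	if (start) {
		ws->event = malloc(1);	/* stand-in for event creation */
		return ws->event ? 0 : -1;
	}
	free(ws->event);		/* stand-in for event cancellation */
	ws->event = NULL;
	return 0;
}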
1904
1905/*
1906 * Synchronous osd object method call
1907 */
1908static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
1909 const char *object_name,
1910 const char *class_name,
1911 const char *method_name,
1912 const char *outbound,
1913 size_t outbound_size,
1914 char *inbound,
1915 size_t inbound_size,
1916 u64 *version)
1917{
1918 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1919 struct rbd_obj_request *obj_request;
1920 struct page **pages;
1921 u32 page_count;
1922 int ret;
1923
1924 /*
1925 * Method calls are ultimately read operations. The result
1926 * should be placed into the inbound buffer provided. The
1927 * caller may also supply outbound data--parameters for the
1928 * object method. Currently if present this will be a
1929 * snapshot id.
1930 */
1931 page_count = (u32) calc_pages_for(0, inbound_size);
1932 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
1933 if (IS_ERR(pages))
1934 return PTR_ERR(pages);
1935
1936 ret = -ENOMEM;
1937 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
1938 OBJ_REQUEST_PAGES);
1939 if (!obj_request)
1940 goto out;
1941
1942 obj_request->pages = pages;
1943 obj_request->page_count = page_count;
1944
1945 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
1946 if (!obj_request->osd_req)
1947 goto out;
1948
1949 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
1950 class_name, method_name);
1951 if (outbound_size) {
1952 struct ceph_pagelist *pagelist;
1953
1954 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
1955 if (!pagelist)
1956 goto out;
1957
1958 ceph_pagelist_init(pagelist);
1959 ceph_pagelist_append(pagelist, outbound, outbound_size);
1960 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
1961 pagelist);
1962 }
1963 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
1964 obj_request->pages, inbound_size,
1965 0, false, false);
1966 rbd_osd_req_format(obj_request, false);
1967
1968 ret = rbd_obj_request_submit(osdc, obj_request);
1969 if (ret)
1970 goto out;
1971 ret = rbd_obj_request_wait(obj_request);
1972 if (ret)
1973 goto out;
1974
1975 ret = obj_request->result;
1976 if (ret < 0)
1977 goto out;
1978 ret = 0;
1979 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
1980 if (version)
1981 *version = obj_request->version;
1982out:
1983 if (obj_request)
1984 rbd_obj_request_put(obj_request);
1985 else
1986 ceph_release_page_vector(pages, page_count);
1987
1988 return ret;
1989}
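/*
 * Illustrative usage sketch (hypothetical call, error handling
 * elided): callers pass method parameters in an outbound buffer and
 * receive the result in an inbound buffer, exactly as
 * rbd_dev_v2_image_size() does further below with "get_size":
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) size_buf = { 0 };
 *
 *	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 *				"rbd", "get_size",
 *				(char *) &snapid, sizeof (snapid),
 *				(char *) &size_buf, sizeof (size_buf),
 *				NULL);
 */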
1990
1991 static void rbd_request_fn(struct request_queue *q)
1992 __releases(q->queue_lock) __acquires(q->queue_lock)
1993{
1994 struct rbd_device *rbd_dev = q->queuedata;
1995 bool read_only = rbd_dev->mapping.read_only;
1996 struct request *rq;
1997 int result;
1998
1999 while ((rq = blk_fetch_request(q))) {
2000 bool write_request = rq_data_dir(rq) == WRITE;
2001 struct rbd_img_request *img_request;
2002 u64 offset;
2003 u64 length;
2004
2005 /* Ignore any non-FS requests that filter through. */
2006
2007 if (rq->cmd_type != REQ_TYPE_FS) {
2008 dout("%s: non-fs request type %d\n", __func__,
2009 (int) rq->cmd_type);
2010 __blk_end_request_all(rq, 0);
2011 continue;
2012 }
2013
2014 /* Ignore/skip any zero-length requests */
2015
2016 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2017 length = (u64) blk_rq_bytes(rq);
2018
2019 if (!length) {
2020 dout("%s: zero-length request\n", __func__);
2021 __blk_end_request_all(rq, 0);
2022 continue;
2023 }
2024
2025 spin_unlock_irq(q->queue_lock);
2026
2027 /* Disallow writes to a read-only device */
2028
2029 if (write_request) {
2030 result = -EROFS;
2031 if (read_only)
2032 goto end_request;
2033 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2034 }
2035
2036 /*
2037 * Quit early if the mapped snapshot no longer
2038 * exists. It's still possible the snapshot will
2039 * have disappeared by the time our request arrives
2040 * at the osd, but there's no sense in sending it if
2041 * we already know.
2042 */
2043 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
2044 dout("request for non-existent snapshot\n");
2045 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2046 result = -ENXIO;
2047 goto end_request;
2048 }
2049
2050 result = -EINVAL;
2051 if (WARN_ON(offset && length > U64_MAX - offset + 1))
2052 goto end_request; /* Shouldn't happen */
2053
2054 result = -ENOMEM;
2055 img_request = rbd_img_request_create(rbd_dev, offset, length,
2056 write_request, false);
2057 if (!img_request)
2058 goto end_request;
2059
2060 img_request->rq = rq;
2061
2062 result = rbd_img_request_fill_bio(img_request, rq->bio);
2063 if (!result)
2064 result = rbd_img_request_submit(img_request);
2065 if (result)
2066 rbd_img_request_put(img_request);
2067end_request:
2068 spin_lock_irq(q->queue_lock);
2069 if (result < 0) {
2070 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2071 write_request ? "write" : "read",
2072 length, offset, result);
2073
2074 __blk_end_request_all(rq, result);
2075 }
2076 }
2077}
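/*
 * Worked example, not driver code: the offset/length computation above
 * is a plain sector-to-byte conversion.  A request starting at sector 8
 * with 1536 bytes of bio data yields offset 8 << 9 = 4096 and length
 * 1536.  Standalone form of the same arithmetic:
 */
#include <stdint.h>
#include <stdio.h>

#define EX_SECTOR_SHIFT	9		/* 512-byte sectors, as in this file */

int main(void)
{
	uint64_t sector = 8;		/* stand-in for blk_rq_pos(rq) */
	uint64_t bytes = 1536;		/* stand-in for blk_rq_bytes(rq) */
	uint64_t offset = sector << EX_SECTOR_SHIFT;

	printf("offset %llu length %llu\n",
	       (unsigned long long) offset, (unsigned long long) bytes);
	return 0;
}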
2078
2079/*
2080 * A queue callback. Makes sure that we don't create a bio that spans
2081 * across multiple osd objects. One exception would be a single-page
2082 * bio, which we handle later in bio_chain_clone_range().
2083 */
2084static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2085 struct bio_vec *bvec)
2086{
2087 struct rbd_device *rbd_dev = q->queuedata;
2088 sector_t sector_offset;
2089 sector_t sectors_per_obj;
2090 sector_t obj_sector_offset;
2091 int ret;
2092
2093 /*
2094 * Find how far into its rbd object the partition-relative
2095 * bio start sector is to offset relative to the enclosing
2096 * device.
2097 */
2098 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2099 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2100 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
2101
2102 /*
2103 * Compute the number of bytes from that offset to the end
2104 * of the object. Account for what's already used by the bio.
2105 */
2106 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2107 if (ret > bmd->bi_size)
2108 ret -= bmd->bi_size;
2109 else
2110 ret = 0;
2111
2112 /*
2113 * Don't send back more than was asked for. And if the bio
2114 * was empty, let the whole thing through because: "Note
2115 * that a block device *must* allow a single page to be
2116 * added to an empty bio."
2117 */
2118 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2119 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2120 ret = (int) bvec->bv_len;
2121
2122 return ret;
2123}
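/*
 * Worked example, not driver code: with 4 MB objects (obj_order 22) an
 * osd object spans 1 << (22 - 9) = 8192 sectors.  A bio starting at
 * device sector 8190 sits 8190 & 8191 = 8190 sectors into its object,
 * so only (8192 - 8190) << 9 = 1024 bytes remain before the boundary.
 * The same arithmetic in standalone form:
 */
#include <stdint.h>

static uint64_t bytes_to_object_end(uint64_t sector, int obj_order)
{
	uint64_t sectors_per_obj = 1ULL << (obj_order - 9);
	uint64_t obj_sector_offset = sector & (sectors_per_obj - 1);

	return (sectors_per_obj - obj_sector_offset) << 9;
}
/* bytes_to_object_end(8190, 22) == 1024, matching rbd_merge_bvec(). */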
2124
2125static void rbd_free_disk(struct rbd_device *rbd_dev)
2126{
2127 struct gendisk *disk = rbd_dev->disk;
2128
2129 if (!disk)
2130 return;
2131
2132 if (disk->flags & GENHD_FL_UP)
2133 del_gendisk(disk);
2134 if (disk->queue)
2135 blk_cleanup_queue(disk->queue);
2136 put_disk(disk);
2137}
2138
2139static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2140 const char *object_name,
2141 u64 offset, u64 length,
2142 char *buf, u64 *version)
2143
2144{
2145 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2146 struct rbd_obj_request *obj_request;
2147 struct page **pages = NULL;
2148 u32 page_count;
2149 size_t size;
2150 int ret;
2151
2152 page_count = (u32) calc_pages_for(offset, length);
2153 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2154 if (IS_ERR(pages))
2155 return PTR_ERR(pages);
2156
2157 ret = -ENOMEM;
2158 obj_request = rbd_obj_request_create(object_name, offset, length,
2159 OBJ_REQUEST_PAGES);
2160 if (!obj_request)
2161 goto out;
2162
2163 obj_request->pages = pages;
2164 obj_request->page_count = page_count;
2165
2166 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
2167 if (!obj_request->osd_req)
2168 goto out;
2169
2170 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2171 offset, length, 0, 0);
2172 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0, false,
2173 obj_request->pages,
2174 obj_request->length,
2175 obj_request->offset & ~PAGE_MASK,
2176 false, false);
2177 rbd_osd_req_format(obj_request, false);
2178
2179 ret = rbd_obj_request_submit(osdc, obj_request);
2180 if (ret)
2181 goto out;
2182 ret = rbd_obj_request_wait(obj_request);
2183 if (ret)
2184 goto out;
2185
2186 ret = obj_request->result;
2187 if (ret < 0)
2188 goto out;
2189
2190 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2191 size = (size_t) obj_request->xferred;
2192 ceph_copy_from_page_vector(pages, buf, 0, size);
2193 rbd_assert(size <= (size_t) INT_MAX);
2194 ret = (int) size;
2195 if (version)
2196 *version = obj_request->version;
2197out:
2198 if (obj_request)
2199 rbd_obj_request_put(obj_request);
2200 else
2201 ceph_release_page_vector(pages, page_count);
2202
2203 return ret;
2204}
2205
2206/*
2207 * Read the complete header for the given rbd device.
2208 *
2209 * Returns a pointer to a dynamically-allocated buffer containing
2210 * the complete and validated header. Caller can pass the address
2211 * of a variable that will be filled in with the version of the
2212 * header object at the time it was read.
2213 *
2214 * Returns a pointer-coded errno if a failure occurs.
2215 */
2216static struct rbd_image_header_ondisk *
2217rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2218 {
2219 struct rbd_image_header_ondisk *ondisk = NULL;
2220 u32 snap_count = 0;
2221 u64 names_size = 0;
2222 u32 want_count;
2223 int ret;
2224
2225 /*
2226 * The complete header will include an array of its 64-bit
2227 * snapshot ids, followed by the names of those snapshots as
2228 * a contiguous block of NUL-terminated strings. Note that
2229 * the number of snapshots could change by the time we read
2230 * it in, in which case we re-read it.
2231 */
2232 do {
2233 size_t size;
2234
2235 kfree(ondisk);
2236
2237 size = sizeof (*ondisk);
2238 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2239 size += names_size;
2240 ondisk = kmalloc(size, GFP_KERNEL);
2241 if (!ondisk)
2242 return ERR_PTR(-ENOMEM);
2243
2244 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
2245 0, size,
2246 (char *) ondisk, version);
2247 if (ret < 0)
2248 goto out_err;
2249 if (WARN_ON((size_t) ret < size)) {
2250 ret = -ENXIO;
2251 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2252 size, ret);
2253 goto out_err;
2254 }
2255 if (!rbd_dev_ondisk_valid(ondisk)) {
2256 ret = -ENXIO;
2257 rbd_warn(rbd_dev, "invalid header");
2258 goto out_err;
2259 }
2260
2261 names_size = le64_to_cpu(ondisk->snap_names_len);
2262 want_count = snap_count;
2263 snap_count = le32_to_cpu(ondisk->snap_count);
2264 } while (snap_count != want_count);
2265
2266 return ondisk;
2267
2268out_err:
2269 kfree(ondisk);
2270
2271 return ERR_PTR(ret);
2272}
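/*
 * Illustrative sketch, not driver code: the loop above sizes the
 * buffer from the last observed snapshot count, reads, and retries
 * whenever the header reports a different count.  The helpers here
 * (read_header(), header_snap_count()) are hypothetical:
 */
#include <stdlib.h>

extern int read_header(void *buf, size_t size);		/* hypothetical */
extern int header_snap_count(const void *buf);		/* hypothetical */

static void *read_consistent_header(size_t base, size_t per_snap)
{
	int want = 0;		/* count we sized the buffer for */
	int got = 0;		/* count the header actually reports */
	void *buf = NULL;

	do {
		want = got;
		free(buf);
		buf = malloc(base + (size_t) want * per_snap);
		if (!buf)
			return NULL;
		if (read_header(buf, base + (size_t) want * per_snap) < 0) {
			free(buf);
			return NULL;
		}
		got = header_snap_count(buf);	/* re-check; loop if it moved */
	} while (got != want);

	return buf;
}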
2273
2274/*
2275 * Reload the on-disk header.
2276 */
2277static int rbd_read_header(struct rbd_device *rbd_dev,
2278 struct rbd_image_header *header)
2279{
2280 struct rbd_image_header_ondisk *ondisk;
2281 u64 ver = 0;
2282 int ret;
2283
2284 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2285 if (IS_ERR(ondisk))
2286 return PTR_ERR(ondisk);
2287 ret = rbd_header_from_disk(header, ondisk);
2288 if (ret >= 0)
2289 header->obj_version = ver;
2290 kfree(ondisk);
2291
2292 return ret;
2293}
2294
2295 static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
2296{
2297 struct rbd_snap *snap;
2298 struct rbd_snap *next;
2299
2300 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
2301 rbd_remove_snap_dev(snap);
2302}
2303
2304static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2305{
2306 sector_t size;
2307
2308 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
2309 return;
2310
2311 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2312 dout("setting size to %llu sectors\n", (unsigned long long) size);
2313 rbd_dev->mapping.size = (u64) size;
2314 set_capacity(rbd_dev->disk, size);
2315}
2316
2317/*
2318 * only read the first part of the ondisk header, without the snaps info
2319 */
2320 static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
2321{
2322 int ret;
2323 struct rbd_image_header h;
2324
2325 ret = rbd_read_header(rbd_dev, &h);
2326 if (ret < 0)
2327 return ret;
2328
2329 down_write(&rbd_dev->header_rwsem);
2330
2331 /* Update image size, and check for resize of mapped image */
2332 rbd_dev->header.image_size = h.image_size;
2333 rbd_update_mapping_size(rbd_dev);
2334
2335 /* rbd_dev->header.object_prefix shouldn't change */
2336 kfree(rbd_dev->header.snap_sizes);
2337 kfree(rbd_dev->header.snap_names);
2338 /* osd requests may still refer to snapc */
2339 ceph_put_snap_context(rbd_dev->header.snapc);
2340
2341 if (hver)
2342 *hver = h.obj_version;
2343 rbd_dev->header.obj_version = h.obj_version;
2344 rbd_dev->header.image_size = h.image_size;
2345 rbd_dev->header.snapc = h.snapc;
2346 rbd_dev->header.snap_names = h.snap_names;
2347 rbd_dev->header.snap_sizes = h.snap_sizes;
2348 /* Free the extra copy of the object prefix */
2349 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2350 kfree(h.object_prefix);
2351
2352 ret = rbd_dev_snaps_update(rbd_dev);
2353 if (!ret)
2354 ret = rbd_dev_snaps_register(rbd_dev);
2355
2356 up_write(&rbd_dev->header_rwsem);
2357
2358 return ret;
2359}
2360
2361 static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
2362{
2363 int ret;
2364
117973fb 2365 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1fe5e993 2366 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
117973fb
AE
2367 if (rbd_dev->image_format == 1)
2368 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2369 else
2370 ret = rbd_dev_v2_refresh(rbd_dev, hver);
2371 mutex_unlock(&ctl_mutex);
2372
2373 return ret;
2374}
2375
2376static int rbd_init_disk(struct rbd_device *rbd_dev)
2377{
2378 struct gendisk *disk;
2379 struct request_queue *q;
2380 u64 segment_size;
2381
2382 /* create gendisk info */
2383 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
2384 if (!disk)
2385 return -ENOMEM;
2386
2387 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
2388 rbd_dev->dev_id);
2389 disk->major = rbd_dev->major;
2390 disk->first_minor = 0;
2391 disk->fops = &rbd_bd_ops;
2392 disk->private_data = rbd_dev;
2393
2394 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
2395 if (!q)
2396 goto out_disk;
2397
2398 /* We use the default size, but let's be explicit about it. */
2399 blk_queue_physical_block_size(q, SECTOR_SIZE);
2400
2401 /* set io sizes to object size */
2402 segment_size = rbd_obj_bytes(&rbd_dev->header);
2403 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
2404 blk_queue_max_segment_size(q, segment_size);
2405 blk_queue_io_min(q, segment_size);
2406 blk_queue_io_opt(q, segment_size);
2407
2408 blk_queue_merge_bvec(q, rbd_merge_bvec);
2409 disk->queue = q;
2410
2411 q->queuedata = rbd_dev;
2412
2413 rbd_dev->disk = disk;
2414
2415 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
2416
2417 return 0;
2418out_disk:
2419 put_disk(disk);
2420
2421 return -ENOMEM;
2422}
2423
2424/*
2425 sysfs
2426*/
2427
2428static struct rbd_device *dev_to_rbd_dev(struct device *dev)
2429{
2430 return container_of(dev, struct rbd_device, dev);
2431}
2432
2433static ssize_t rbd_size_show(struct device *dev,
2434 struct device_attribute *attr, char *buf)
2435{
2436 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2437 sector_t size;
2438
2439 down_read(&rbd_dev->header_rwsem);
2440 size = get_capacity(rbd_dev->disk);
2441 up_read(&rbd_dev->header_rwsem);
2442
2443 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
2444}
2445
2446/*
2447 * Note this shows the features for whatever's mapped, which is not
2448 * necessarily the base image.
2449 */
2450static ssize_t rbd_features_show(struct device *dev,
2451 struct device_attribute *attr, char *buf)
2452{
2453 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2454
2455 return sprintf(buf, "0x%016llx\n",
2456 (unsigned long long) rbd_dev->mapping.features);
2457}
2458
2459static ssize_t rbd_major_show(struct device *dev,
2460 struct device_attribute *attr, char *buf)
2461{
2462 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2463
2464 return sprintf(buf, "%d\n", rbd_dev->major);
2465}
2466
2467static ssize_t rbd_client_id_show(struct device *dev,
2468 struct device_attribute *attr, char *buf)
2469 {
2470 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2471
2472 return sprintf(buf, "client%lld\n",
2473 ceph_client_id(rbd_dev->rbd_client->client));
2474}
2475
2476static ssize_t rbd_pool_show(struct device *dev,
2477 struct device_attribute *attr, char *buf)
2478 {
2479 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2480
2481 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
2482}
2483
2484static ssize_t rbd_pool_id_show(struct device *dev,
2485 struct device_attribute *attr, char *buf)
2486{
2487 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2488
2489 return sprintf(buf, "%llu\n",
2490 (unsigned long long) rbd_dev->spec->pool_id);
2491}
2492
2493static ssize_t rbd_name_show(struct device *dev,
2494 struct device_attribute *attr, char *buf)
2495{
2496 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2497
2498 if (rbd_dev->spec->image_name)
2499 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2500
2501 return sprintf(buf, "(unknown)\n");
2502}
2503
2504static ssize_t rbd_image_id_show(struct device *dev,
2505 struct device_attribute *attr, char *buf)
2506{
2507 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2508
2509 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
2510}
2511
2512/*
2513 * Shows the name of the currently-mapped snapshot (or
2514 * RBD_SNAP_HEAD_NAME for the base image).
2515 */
2516static ssize_t rbd_snap_show(struct device *dev,
2517 struct device_attribute *attr,
2518 char *buf)
2519{
2520 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2521
2522 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
2523}
2524
2525/*
2526 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2527 * for the parent image. If there is no parent, simply shows
2528 * "(no parent image)".
2529 */
2530static ssize_t rbd_parent_show(struct device *dev,
2531 struct device_attribute *attr,
2532 char *buf)
2533{
2534 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2535 struct rbd_spec *spec = rbd_dev->parent_spec;
2536 int count;
2537 char *bufp = buf;
2538
2539 if (!spec)
2540 return sprintf(buf, "(no parent image)\n");
2541
2542 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2543 (unsigned long long) spec->pool_id, spec->pool_name);
2544 if (count < 0)
2545 return count;
2546 bufp += count;
2547
2548 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2549 spec->image_name ? spec->image_name : "(unknown)");
2550 if (count < 0)
2551 return count;
2552 bufp += count;
2553
2554 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2555 (unsigned long long) spec->snap_id, spec->snap_name);
2556 if (count < 0)
2557 return count;
2558 bufp += count;
2559
2560 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2561 if (count < 0)
2562 return count;
2563 bufp += count;
2564
2565 return (ssize_t) (bufp - buf);
2566}
2567
2568static ssize_t rbd_image_refresh(struct device *dev,
2569 struct device_attribute *attr,
2570 const char *buf,
2571 size_t size)
2572{
2573 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2574 int ret;
2575
2576 ret = rbd_dev_refresh(rbd_dev, NULL);
2577
2578 return ret < 0 ? ret : size;
2579}
2580
2581 static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
2582 static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
2583static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2584static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2585static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
2586 static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
2587 static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
2588 static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
2589static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2590static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
2591 static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
2592
2593static struct attribute *rbd_attrs[] = {
2594 &dev_attr_size.attr,
2595 &dev_attr_features.attr,
2596 &dev_attr_major.attr,
2597 &dev_attr_client_id.attr,
2598 &dev_attr_pool.attr,
2599 &dev_attr_pool_id.attr,
2600 &dev_attr_name.attr,
2601 &dev_attr_image_id.attr,
2602 &dev_attr_current_snap.attr,
2603 &dev_attr_parent.attr,
2604 &dev_attr_refresh.attr,
2605 NULL
2606};
2607
2608static struct attribute_group rbd_attr_group = {
2609 .attrs = rbd_attrs,
2610};
2611
2612static const struct attribute_group *rbd_attr_groups[] = {
2613 &rbd_attr_group,
2614 NULL
2615};
2616
2617static void rbd_sysfs_dev_release(struct device *dev)
2618{
2619}
2620
2621static struct device_type rbd_device_type = {
2622 .name = "rbd",
2623 .groups = rbd_attr_groups,
2624 .release = rbd_sysfs_dev_release,
2625};
2626
2627
2628/*
2629 sysfs - snapshots
2630*/
2631
2632static ssize_t rbd_snap_size_show(struct device *dev,
2633 struct device_attribute *attr,
2634 char *buf)
2635{
2636 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2637
2638 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
2639}
2640
2641static ssize_t rbd_snap_id_show(struct device *dev,
2642 struct device_attribute *attr,
2643 char *buf)
2644{
2645 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2646
2647 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
2648}
2649
2650static ssize_t rbd_snap_features_show(struct device *dev,
2651 struct device_attribute *attr,
2652 char *buf)
2653{
2654 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2655
2656 return sprintf(buf, "0x%016llx\n",
2657 (unsigned long long) snap->features);
2658}
2659
2660static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2661static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2662 static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
2663
2664static struct attribute *rbd_snap_attrs[] = {
2665 &dev_attr_snap_size.attr,
2666 &dev_attr_snap_id.attr,
2667 &dev_attr_snap_features.attr,
2668 NULL,
2669};
2670
2671static struct attribute_group rbd_snap_attr_group = {
2672 .attrs = rbd_snap_attrs,
2673};
2674
2675static void rbd_snap_dev_release(struct device *dev)
2676{
2677 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2678 kfree(snap->name);
2679 kfree(snap);
2680}
2681
2682static const struct attribute_group *rbd_snap_attr_groups[] = {
2683 &rbd_snap_attr_group,
2684 NULL
2685};
2686
2687static struct device_type rbd_snap_device_type = {
2688 .groups = rbd_snap_attr_groups,
2689 .release = rbd_snap_dev_release,
2690};
2691
2692static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2693{
2694 kref_get(&spec->kref);
2695
2696 return spec;
2697}
2698
2699static void rbd_spec_free(struct kref *kref);
2700static void rbd_spec_put(struct rbd_spec *spec)
2701{
2702 if (spec)
2703 kref_put(&spec->kref, rbd_spec_free);
2704}
2705
2706static struct rbd_spec *rbd_spec_alloc(void)
2707{
2708 struct rbd_spec *spec;
2709
2710 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2711 if (!spec)
2712 return NULL;
2713 kref_init(&spec->kref);
2714
2715 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2716
2717 return spec;
2718}
2719
2720static void rbd_spec_free(struct kref *kref)
2721{
2722 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2723
2724 kfree(spec->pool_name);
2725 kfree(spec->image_id);
2726 kfree(spec->image_name);
2727 kfree(spec->snap_name);
2728 kfree(spec);
2729}
2730
2731 static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2732 struct rbd_spec *spec)
2733{
2734 struct rbd_device *rbd_dev;
2735
2736 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2737 if (!rbd_dev)
2738 return NULL;
2739
2740 spin_lock_init(&rbd_dev->lock);
6d292906 2741 rbd_dev->flags = 0;
c53d5893
AE
2742 INIT_LIST_HEAD(&rbd_dev->node);
2743 INIT_LIST_HEAD(&rbd_dev->snaps);
2744 init_rwsem(&rbd_dev->header_rwsem);
2745
2746 rbd_dev->spec = spec;
2747 rbd_dev->rbd_client = rbdc;
2748
2749 /* Initialize the layout used for all rbd requests */
2750
2751 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2752 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2753 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2754 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2755
2756 return rbd_dev;
2757}
2758
2759static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2760{
2761 rbd_spec_put(rbd_dev->parent_spec);
2762 kfree(rbd_dev->header_name);
2763 rbd_put_client(rbd_dev->rbd_client);
2764 rbd_spec_put(rbd_dev->spec);
2765 kfree(rbd_dev);
2766}
2767
2768static bool rbd_snap_registered(struct rbd_snap *snap)
2769{
2770 bool ret = snap->dev.type == &rbd_snap_device_type;
2771 bool reg = device_is_registered(&snap->dev);
2772
2773 rbd_assert(!ret ^ reg);
2774
2775 return ret;
2776}
2777
2778 static void rbd_remove_snap_dev(struct rbd_snap *snap)
2779{
2780 list_del(&snap->node);
2781 if (device_is_registered(&snap->dev))
2782 device_unregister(&snap->dev);
2783}
2784
2785 static int rbd_register_snap_dev(struct rbd_snap *snap,
2786 struct device *parent)
2787{
2788 struct device *dev = &snap->dev;
2789 int ret;
2790
2791 dev->type = &rbd_snap_device_type;
2792 dev->parent = parent;
2793 dev->release = rbd_snap_dev_release;
2794 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
2795 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2796
2797 ret = device_register(dev);
2798
2799 return ret;
2800}
2801
2802 static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2803 const char *snap_name,
2804 u64 snap_id, u64 snap_size,
2805 u64 snap_features)
2806 {
2807 struct rbd_snap *snap;
2808 int ret;
2809
2810 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
2811 if (!snap)
2812 return ERR_PTR(-ENOMEM);
2813
2814 ret = -ENOMEM;
2815 snap->name = kstrdup(snap_name, GFP_KERNEL);
2816 if (!snap->name)
2817 goto err;
2818
2819 snap->id = snap_id;
2820 snap->size = snap_size;
2821 snap->features = snap_features;
2822
2823 return snap;
2824
2825err:
2826 kfree(snap->name);
2827 kfree(snap);
2828
2829 return ERR_PTR(ret);
2830}
2831
2832static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2833 u64 *snap_size, u64 *snap_features)
2834{
2835 char *snap_name;
2836
2837 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2838
2839 *snap_size = rbd_dev->header.snap_sizes[which];
2840 *snap_features = 0; /* No features for v1 */
2841
2842 /* Skip over names until we find the one we are looking for */
2843
2844 snap_name = rbd_dev->header.snap_names;
2845 while (which--)
2846 snap_name += strlen(snap_name) + 1;
2847
2848 return snap_name;
2849}
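/*
 * Illustrative sketch: v1 snapshot names are stored as consecutive
 * NUL-terminated strings, so finding the name at a given index is the
 * linear walk used above.  Standalone form:
 */
#include <string.h>

static const char *nth_string(const char *names, unsigned int which)
{
	while (which--)
		names += strlen(names) + 1;	/* skip a name and its NUL */

	return names;
}
/* nth_string("one\0two\0three\0", 2) returns "three". */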
2850
2851/*
2852 * Get the size and object order for an image snapshot, or if
2853 * snap_id is CEPH_NOSNAP, gets this information for the base
2854 * image.
2855 */
2856static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2857 u8 *order, u64 *snap_size)
2858{
2859 __le64 snapid = cpu_to_le64(snap_id);
2860 int ret;
2861 struct {
2862 u8 order;
2863 __le64 size;
2864 } __attribute__ ((packed)) size_buf = { 0 };
2865
2866 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2867 "rbd", "get_size",
2868 (char *) &snapid, sizeof (snapid),
2869 (char *) &size_buf, sizeof (size_buf), NULL);
2870 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2871 if (ret < 0)
2872 return ret;
2873
2874 *order = size_buf.order;
2875 *snap_size = le64_to_cpu(size_buf.size);
2876
2877 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2878 (unsigned long long) snap_id, (unsigned int) *order,
2879 (unsigned long long) *snap_size);
2880
2881 return 0;
2882}
2883
2884static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2885{
2886 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2887 &rbd_dev->header.obj_order,
2888 &rbd_dev->header.image_size);
2889}
2890
2891static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2892{
2893 void *reply_buf;
2894 int ret;
2895 void *p;
2896
2897 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2898 if (!reply_buf)
2899 return -ENOMEM;
2900
2901 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2902 "rbd", "get_object_prefix",
2903 NULL, 0,
2904 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
2905 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2906 if (ret < 0)
2907 goto out;
2908
2909 p = reply_buf;
2910 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2911 p + RBD_OBJ_PREFIX_LEN_MAX,
2912 NULL, GFP_NOIO);
2913
2914 if (IS_ERR(rbd_dev->header.object_prefix)) {
2915 ret = PTR_ERR(rbd_dev->header.object_prefix);
2916 rbd_dev->header.object_prefix = NULL;
2917 } else {
2918 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2919 }
2920
2921out:
2922 kfree(reply_buf);
2923
2924 return ret;
2925}
2926
2927static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2928 u64 *snap_features)
2929{
2930 __le64 snapid = cpu_to_le64(snap_id);
2931 struct {
2932 __le64 features;
2933 __le64 incompat;
2934 } features_buf = { 0 };
2935 u64 incompat;
2936 int ret;
2937
2938 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2939 "rbd", "get_features",
2940 (char *) &snapid, sizeof (snapid),
2941 (char *) &features_buf, sizeof (features_buf),
2942 NULL);
2943 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2944 if (ret < 0)
2945 return ret;
2946
2947 incompat = le64_to_cpu(features_buf.incompat);
2948 if (incompat & ~RBD_FEATURES_SUPPORTED)
2949 return -ENXIO;
2950
2951 *snap_features = le64_to_cpu(features_buf.features);
2952
2953 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2954 (unsigned long long) snap_id,
2955 (unsigned long long) *snap_features,
2956 (unsigned long long) le64_to_cpu(features_buf.incompat));
2957
2958 return 0;
2959}
2960
2961static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2962{
2963 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2964 &rbd_dev->header.features);
2965}
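/*
 * Illustrative sketch: the incompat test above refuses an image that
 * requires any feature bit outside RBD_FEATURES_SUPPORTED.  The mask
 * check in isolation:
 */
#include <stdint.h>

static int incompat_ok(uint64_t incompat, uint64_t supported)
{
	return (incompat & ~supported) == 0;	/* no unsupported bits set */
}
/* e.g. incompat_ok(0x1, 0x0) is 0: the image needs bit 0, which this
 * client does not implement, so the map attempt must fail. */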
2966
2967static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2968{
2969 struct rbd_spec *parent_spec;
2970 size_t size;
2971 void *reply_buf = NULL;
2972 __le64 snapid;
2973 void *p;
2974 void *end;
2975 char *image_id;
2976 u64 overlap;
2977 int ret;
2978
2979 parent_spec = rbd_spec_alloc();
2980 if (!parent_spec)
2981 return -ENOMEM;
2982
2983 size = sizeof (__le64) + /* pool_id */
2984 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2985 sizeof (__le64) + /* snap_id */
2986 sizeof (__le64); /* overlap */
2987 reply_buf = kmalloc(size, GFP_KERNEL);
2988 if (!reply_buf) {
2989 ret = -ENOMEM;
2990 goto out_err;
2991 }
2992
2993 snapid = cpu_to_le64(CEPH_NOSNAP);
2994 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
2995 "rbd", "get_parent",
2996 (char *) &snapid, sizeof (snapid),
2997 (char *) reply_buf, size, NULL);
2998 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
2999 if (ret < 0)
3000 goto out_err;
3001
3002 ret = -ERANGE;
3003 p = reply_buf;
3004 end = (char *) reply_buf + size;
3005 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3006 if (parent_spec->pool_id == CEPH_NOPOOL)
3007 goto out; /* No parent? No problem. */
3008
3009 /* The ceph file layout needs to fit pool id in 32 bits */
3010
3011 ret = -EIO;
3012 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
3013 goto out;
3014
3015 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3016 if (IS_ERR(image_id)) {
3017 ret = PTR_ERR(image_id);
3018 goto out_err;
3019 }
3020 parent_spec->image_id = image_id;
3021 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3022 ceph_decode_64_safe(&p, end, overlap, out_err);
3023
3024 rbd_dev->parent_overlap = overlap;
3025 rbd_dev->parent_spec = parent_spec;
3026 parent_spec = NULL; /* rbd_dev now owns this */
3027out:
3028 ret = 0;
3029out_err:
3030 kfree(reply_buf);
3031 rbd_spec_put(parent_spec);
3032
3033 return ret;
3034}
3035
3036static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3037{
3038 size_t image_id_size;
3039 char *image_id;
3040 void *p;
3041 void *end;
3042 size_t size;
3043 void *reply_buf = NULL;
3044 size_t len = 0;
3045 char *image_name = NULL;
3046 int ret;
3047
3048 rbd_assert(!rbd_dev->spec->image_name);
3049
3050 len = strlen(rbd_dev->spec->image_id);
3051 image_id_size = sizeof (__le32) + len;
3052 image_id = kmalloc(image_id_size, GFP_KERNEL);
3053 if (!image_id)
3054 return NULL;
3055
3056 p = image_id;
3057 end = (char *) image_id + image_id_size;
3058 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
3059
3060 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3061 reply_buf = kmalloc(size, GFP_KERNEL);
3062 if (!reply_buf)
3063 goto out;
3064
3065 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
3066 "rbd", "dir_get_name",
3067 image_id, image_id_size,
3068 (char *) reply_buf, size, NULL);
3069 if (ret < 0)
3070 goto out;
3071 p = reply_buf;
3072 end = (char *) reply_buf + size;
3073 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3074 if (IS_ERR(image_name))
3075 image_name = NULL;
3076 else
3077 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3078out:
3079 kfree(reply_buf);
3080 kfree(image_id);
3081
3082 return image_name;
3083}
3084
3085/*
3086 * When a parent image gets probed, we only have the pool, image,
3087 * and snapshot ids but not the names of any of them. This call
3088 * is made later to fill in those names. It has to be done after
3089 * rbd_dev_snaps_update() has completed because some of the
3090 * information (in particular, snapshot name) is not available
3091 * until then.
3092 */
3093static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3094{
3095 struct ceph_osd_client *osdc;
3096 const char *name;
3097 void *reply_buf = NULL;
3098 int ret;
3099
3100 if (rbd_dev->spec->pool_name)
3101 return 0; /* Already have the names */
3102
3103 /* Look up the pool name */
3104
3105 osdc = &rbd_dev->rbd_client->client->osdc;
3106 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
3107 if (!name) {
3108 rbd_warn(rbd_dev, "there is no pool with id %llu",
3109 rbd_dev->spec->pool_id); /* Really a BUG() */
3110 return -EIO;
3111 }
3112
3113 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3114 if (!rbd_dev->spec->pool_name)
3115 return -ENOMEM;
3116
3117 /* Fetch the image name; tolerate failure here */
3118
3119 name = rbd_dev_image_name(rbd_dev);
3120 if (name)
3121 rbd_dev->spec->image_name = (char *) name;
3122 else
3123 rbd_warn(rbd_dev, "unable to get image name");
3124
3125 /* Look up the snapshot name. */
3126
3127 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3128 if (!name) {
3129 rbd_warn(rbd_dev, "no snapshot with id %llu",
3130 rbd_dev->spec->snap_id); /* Really a BUG() */
3131 ret = -EIO;
3132 goto out_err;
3133 }
3134 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3135 ret = -ENOMEM;
3136 if (!rbd_dev->spec->snap_name)
3137 goto out_err;
3138 return 0;
3139out_err:
3140 kfree(reply_buf);
3141 kfree(rbd_dev->spec->pool_name);
3142 rbd_dev->spec->pool_name = NULL;
3143
3144 return ret;
3145}
3146
3147 static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
3148{
3149 size_t size;
3150 int ret;
3151 void *reply_buf;
3152 void *p;
3153 void *end;
3154 u64 seq;
3155 u32 snap_count;
3156 struct ceph_snap_context *snapc;
3157 u32 i;
3158
3159 /*
3160 * We'll need room for the seq value (maximum snapshot id),
3161 * snapshot count, and array of that many snapshot ids.
3162 * For now we have a fixed upper limit on the number we're
3163 * prepared to receive.
3164 */
3165 size = sizeof (__le64) + sizeof (__le32) +
3166 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3167 reply_buf = kzalloc(size, GFP_KERNEL);
3168 if (!reply_buf)
3169 return -ENOMEM;
3170
3171 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3172 "rbd", "get_snapcontext",
3173 NULL, 0,
3174 reply_buf, size, ver);
3175 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3176 if (ret < 0)
3177 goto out;
3178
3179 ret = -ERANGE;
3180 p = reply_buf;
3181 end = (char *) reply_buf + size;
3182 ceph_decode_64_safe(&p, end, seq, out);
3183 ceph_decode_32_safe(&p, end, snap_count, out);
3184
3185 /*
3186 * Make sure the reported number of snapshot ids wouldn't go
3187 * beyond the end of our buffer. But before checking that,
3188 * make sure the computed size of the snapshot context we
3189 * allocate is representable in a size_t.
3190 */
3191 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3192 / sizeof (u64)) {
3193 ret = -EINVAL;
3194 goto out;
3195 }
3196 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3197 goto out;
3198
3199 size = sizeof (struct ceph_snap_context) +
3200 snap_count * sizeof (snapc->snaps[0]);
3201 snapc = kmalloc(size, GFP_KERNEL);
3202 if (!snapc) {
3203 ret = -ENOMEM;
3204 goto out;
3205 }
3206
3207 atomic_set(&snapc->nref, 1);
3208 snapc->seq = seq;
3209 snapc->num_snaps = snap_count;
3210 for (i = 0; i < snap_count; i++)
3211 snapc->snaps[i] = ceph_decode_64(&p);
3212
3213 rbd_dev->header.snapc = snapc;
3214
3215 dout(" snap context seq = %llu, snap_count = %u\n",
3216 (unsigned long long) seq, (unsigned int) snap_count);
3217 ret = 0;
3218out:
3219 kfree(reply_buf);
3220
3221 return ret;
3222}
3223
3224static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3225{
3226 size_t size;
3227 void *reply_buf;
3228 __le64 snap_id;
3229 int ret;
3230 void *p;
3231 void *end;
3232 char *snap_name;
3233
3234 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3235 reply_buf = kmalloc(size, GFP_KERNEL);
3236 if (!reply_buf)
3237 return ERR_PTR(-ENOMEM);
3238
3239 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
36be9a76 3240 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
b8b1e2db
AE
3241 "rbd", "get_snapshot_name",
3242 (char *) &snap_id, sizeof (snap_id),
3243 reply_buf, size, NULL);
3244 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3245 if (ret < 0)
3246 goto out;
3247
3248 p = reply_buf;
3249 end = (char *) reply_buf + size;
3250 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
3251 if (IS_ERR(snap_name)) {
3252 ret = PTR_ERR(snap_name);
3253 goto out;
3254 } else {
3255 dout(" snap_id 0x%016llx snap_name = %s\n",
3256 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3257 }
3258 kfree(reply_buf);
3259
3260 return snap_name;
3261out:
3262 kfree(reply_buf);
3263
3264 return ERR_PTR(ret);
3265}
3266
3267static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3268 u64 *snap_size, u64 *snap_features)
3269{
3270 u64 snap_id;
3271 u8 order;
3272 int ret;
3273
3274 snap_id = rbd_dev->header.snapc->snaps[which];
3275 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3276 if (ret)
3277 return ERR_PTR(ret);
3278 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3279 if (ret)
3280 return ERR_PTR(ret);
3281
3282 return rbd_dev_v2_snap_name(rbd_dev, which);
3283}
3284
3285static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3286 u64 *snap_size, u64 *snap_features)
3287{
3288 if (rbd_dev->image_format == 1)
3289 return rbd_dev_v1_snap_info(rbd_dev, which,
3290 snap_size, snap_features);
3291 if (rbd_dev->image_format == 2)
3292 return rbd_dev_v2_snap_info(rbd_dev, which,
3293 snap_size, snap_features);
3294 return ERR_PTR(-EINVAL);
3295}
3296
3297static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3298{
3299 int ret;
3300 __u8 obj_order;
3301
3302 down_write(&rbd_dev->header_rwsem);
3303
3304 /* Grab old order first, to see if it changes */
3305
3306 obj_order = rbd_dev->header.obj_order;
3307 ret = rbd_dev_v2_image_size(rbd_dev);
3308 if (ret)
3309 goto out;
3310 if (rbd_dev->header.obj_order != obj_order) {
3311 ret = -EIO;
3312 goto out;
3313 }
3314 rbd_update_mapping_size(rbd_dev);
3315
3316 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3317 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3318 if (ret)
3319 goto out;
3320 ret = rbd_dev_snaps_update(rbd_dev);
3321 dout("rbd_dev_snaps_update returned %d\n", ret);
3322 if (ret)
3323 goto out;
3324 ret = rbd_dev_snaps_register(rbd_dev);
3325 dout("rbd_dev_snaps_register returned %d\n", ret);
3326out:
3327 up_write(&rbd_dev->header_rwsem);
3328
3329 return ret;
3330}
3331
3332/*
3333 * Scan the rbd device's current snapshot list and compare it to the
3334 * newly-received snapshot context. Remove any existing snapshots
3335 * not present in the new snapshot context. Add a new snapshot for
3336 * any snapshots in the snapshot context not in the current list.
3337 * And verify there are no changes to snapshots we already know
3338 * about.
3339 *
3340 * Assumes the snapshots in the snapshot context are sorted by
3341 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
3342 * are also maintained in that order.)
3343 */
3344 static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
3345 {
3346 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3347 const u32 snap_count = snapc->num_snaps;
3348 struct list_head *head = &rbd_dev->snaps;
3349 struct list_head *links = head->next;
3350 u32 index = 0;
3351
3352 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
3353 while (index < snap_count || links != head) {
3354 u64 snap_id;
3355 struct rbd_snap *snap;
3356 char *snap_name;
3357 u64 snap_size = 0;
3358 u64 snap_features = 0;
3359
3360 snap_id = index < snap_count ? snapc->snaps[index]
3361 : CEPH_NOSNAP;
3362 snap = links != head ? list_entry(links, struct rbd_snap, node)
3363 : NULL;
3364 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
3365
3366 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
3367 struct list_head *next = links->next;
3368
3369 /*
3370 * A previously-existing snapshot is not in
3371 * the new snap context.
3372 *
3373 * If the now missing snapshot is the one the
3374 * image is mapped to, clear its exists flag
3375 * so we can avoid sending any more requests
3376 * to it.
3377 */
3378 if (rbd_dev->spec->snap_id == snap->id)
3379 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3380 rbd_remove_snap_dev(snap);
3381 dout("%ssnap id %llu has been removed\n",
3382 rbd_dev->spec->snap_id == snap->id ?
3383 "mapped " : "",
3384 (unsigned long long) snap->id);
3385
3386 /* Done with this list entry; advance */
3387
3388 links = next;
3389 continue;
3390 }
3391
3392 snap_name = rbd_dev_snap_info(rbd_dev, index,
3393 &snap_size, &snap_features);
3394 if (IS_ERR(snap_name))
3395 return PTR_ERR(snap_name);
3396
3397 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
3398 (unsigned long long) snap_id);
3399 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
3400 struct rbd_snap *new_snap;
3401
3402 /* We haven't seen this snapshot before */
3403
3404 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
3405 snap_id, snap_size, snap_features);
3406 if (IS_ERR(new_snap)) {
3407 int err = PTR_ERR(new_snap);
3408
3409 dout(" failed to add dev, error %d\n", err);
3410
3411 return err;
3412 }
3413
3414 /* New goes before existing, or at end of list */
3415
3416 dout(" added dev%s\n", snap ? "" : " at end");
3417 if (snap)
3418 list_add_tail(&new_snap->node, &snap->node);
3419 else
3420 list_add_tail(&new_snap->node, head);
3421 } else {
3422 /* Already have this one */
3423
3424 dout(" already present\n");
3425
3426 rbd_assert(snap->size == snap_size);
3427 rbd_assert(!strcmp(snap->name, snap_name));
3428 rbd_assert(snap->features == snap_features);
3429
3430 /* Done with this list entry; advance */
3431
3432 links = links->next;
3433 }
3434
3435 /* Advance to the next entry in the snapshot context */
3436
3437 index++;
3438 }
3439 dout("%s: done\n", __func__);
3440
3441 return 0;
3442}
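/*
 * Illustrative sketch, not driver code: rbd_dev_snaps_update() is a
 * single-pass merge of two sequences kept in the same (descending)
 * order -- the snapshot context and the device's snapshot list.  The
 * skeleton of that walk over two descending integer arrays:
 */
static void merge_walk(const long *fresh, unsigned int fresh_cnt,
		       const long *old, unsigned int old_cnt)
{
	unsigned int i = 0, j = 0;

	while (i < fresh_cnt || j < old_cnt) {
		if (j < old_cnt && (i == fresh_cnt || old[j] > fresh[i])) {
			j++;		/* only in old: remove it */
		} else if (i < fresh_cnt &&
			   (j == old_cnt || fresh[i] > old[j])) {
			i++;		/* only in fresh: add it */
		} else {
			i++;		/* in both: verify it is unchanged */
			j++;
		}
	}
}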
3443
3444/*
3445 * Scan the list of snapshots and register the devices for any that
3446 * have not already been registered.
3447 */
3448static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3449{
3450 struct rbd_snap *snap;
3451 int ret = 0;
3452
3453 dout("%s:\n", __func__);
3454 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3455 return -EIO;
3456
3457 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3458 if (!rbd_snap_registered(snap)) {
3459 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3460 if (ret < 0)
3461 break;
3462 }
3463 }
3464 dout("%s: returning %d\n", __func__, ret);
3465
3466 return ret;
3467}
3468
3469static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3470{
3471 struct device *dev;
3472 int ret;
3473
3474 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3475
3476 dev = &rbd_dev->dev;
3477 dev->bus = &rbd_bus_type;
3478 dev->type = &rbd_device_type;
3479 dev->parent = &rbd_root_dev;
3480 dev->release = rbd_dev_release;
3481 dev_set_name(dev, "%d", rbd_dev->dev_id);
3482 ret = device_register(dev);
3483
3484 mutex_unlock(&ctl_mutex);
3485
3486 return ret;
3487}
3488
3489static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3490{
3491 device_unregister(&rbd_dev->dev);
3492}
3493
3494 static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
3495
3496/*
3497 * Get a unique rbd identifier for the given new rbd_dev, and add
3498 * the rbd_dev to the global list. The minimum rbd id is 1.
3499 */
3500 static void rbd_dev_id_get(struct rbd_device *rbd_dev)
3501 {
3502 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
3503
3504 spin_lock(&rbd_dev_list_lock);
3505 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3506 spin_unlock(&rbd_dev_list_lock);
e2839308
AE
3507 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3508 (unsigned long long) rbd_dev->dev_id);
1ddbe94e 3509}
b7f23c36 3510
1ddbe94e 3511/*
499afd5b
AE
3512 * Remove an rbd_dev from the global list, and record that its
3513 * identifier is no longer in use.
3514 */
3515 static void rbd_dev_id_put(struct rbd_device *rbd_dev)
3516 {
3517 struct list_head *tmp;
3518 int rbd_id = rbd_dev->dev_id;
3519 int max_id;
3520
3521 rbd_assert(rbd_id > 0);
3522
3523 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3524 (unsigned long long) rbd_dev->dev_id);
3525 spin_lock(&rbd_dev_list_lock);
3526 list_del_init(&rbd_dev->node);
3527
3528 /*
3529 * If the id being "put" is not the current maximum, there
3530 * is nothing special we need to do.
3531 */
3532 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
3533 spin_unlock(&rbd_dev_list_lock);
3534 return;
3535 }
3536
3537 /*
3538 * We need to update the current maximum id. Search the
3539 * list to find out what it is. We're more likely to find
3540 * the maximum at the end, so search the list backward.
3541 */
3542 max_id = 0;
3543 list_for_each_prev(tmp, &rbd_dev_list) {
3544 struct rbd_device *rbd_dev;
3545
3546 rbd_dev = list_entry(tmp, struct rbd_device, node);
3547 if (rbd_dev->dev_id > max_id)
3548 max_id = rbd_dev->dev_id;
3549 }
3550 spin_unlock(&rbd_dev_list_lock);
3551
3552 /*
3553 * The max id could have been updated by rbd_dev_id_get(), in
3554 * which case it now accurately reflects the new maximum.
3555 * Be careful not to overwrite the maximum value in that
3556 * case.
1ddbe94e 3557 */
e2839308
AE
3558 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3559 dout(" max dev id has been reset\n");
b7f23c36
AE
3560}
3561
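/*
 * Worked example of the reclaim logic above (illustrative only):
 * with devices 1, 2 and 3 mapped, putting id 3 recomputes
 * max_id = 2, and the cmpxchg swings rbd_dev_id_max from 3 to 2.
 * If a concurrent rbd_dev_id_get() had already bumped the counter
 * to 4, the cmpxchg simply fails and the newer maximum is kept.
 */
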
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}

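/*
 * Example (illustrative only): given buf pointing at "  rbd foo",
 * next_token(&buf) returns 3 and leaves buf at "rbd foo"; note
 * that the caller must advance past the token itself (buf += len).
 */
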
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len;

	len = next_token(buf);
	if (len < token_size) {
		memcpy(token, *buf, len);
		*(token + len) = '\0';
	}
	*buf += len;

	return len;
}

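/*
 * Example (illustrative only): with *buf = "rbd0 rest" and
 * token_size = 4, the token "rbd0" would not fit (4 >= 4), so
 * nothing is copied; 4 is returned and *buf is left at " rest".
 */
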
/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}

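/*
 * Example (illustrative only): dup_token() on "rbd foo" returns a
 * newly allocated "rbd" and leaves *buf at " foo"; the caller owns
 * the result and must kfree() it when done.
 */
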
/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}

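/*
 * Example of a complete mapping request as written to the sysfs
 * file (illustrative values only):
 *
 *   # echo "1.2.3.4:6789 name=admin,secret=AQB... rbd myimage" \
 *          > /sys/bus/rbd/add
 *
 * which this parser splits into mon_addrs "1.2.3.4:6789", options
 * "name=admin,secret=AQB...", pool "rbd", image "myimage", and the
 * default snapshot name "-" (map the image head, read/write).
 */
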
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}

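/*
 * Illustrative example, assuming the conventional RBD_ID_PREFIX of
 * "rbd_id." from rbd_types.h: probing image "myimage" issues the
 * "get_id" class method against the object "rbd_id.myimage", whose
 * reply decodes to an id string such as "10052ae8944a"; that id,
 * not the user-visible name, keys all other per-image objects.
 */
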
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}

static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}

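/*
 * Illustrative summary of header object naming, assuming the usual
 * definitions of RBD_SUFFIX (".rbd") and RBD_HEADER_PREFIX
 * ("rbd_header.") from rbd_types.h:
 *
 *   format 1:  "<image_name>.rbd"        e.g. "myimage.rbd"
 *   format 2:  "rbd_header.<image_id>"   e.g. "rbd_header.10052ae8944a"
 */
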
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret) {
		dout("probe failed, returning %d\n", ret);

		return ret;
	}

	ret = rbd_dev_probe_finish(rbd_dev);
	if (ret)
		rbd_header_free(&rbd_dev->header);

	return ret;
}

static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}

static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{
	struct list_head *tmp;
	struct rbd_device *rbd_dev;

	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			spin_unlock(&rbd_dev_list_lock);
			return rbd_dev;
		}
	}
	spin_unlock(&rbd_dev_list_lock);
	return NULL;
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	spin_lock_irq(&rbd_dev->lock);
	if (rbd_dev->open_count)
		ret = -EBUSY;
	else
		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	spin_unlock_irq(&rbd_dev->lock);
	if (ret < 0)
		goto done;

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}

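/*
 * Example (illustrative only): "echo 1 > /sys/bus/rbd/remove"
 * unmaps the device with id 1 (i.e. /dev/rbd1); the write fails
 * with -EBUSY if the device is still open.
 */
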
/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");

		return -EINVAL;
	}
	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
	return 0;
}

static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");