]> git.ipfire.org Git - people/arne_f/kernel.git/blame - drivers/block/rbd.c
rbd: add code to get the size of a v2 rbd image
[people/arne_f/kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* It might be useful to have this defined elsewhere too */

#define	U64_MAX	((u64) (~0ULL))

/* Short driver name (used for device naming) and long form (messages) */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Pseudo snapshot name used when mapping the base (head) image */
#define RBD_SNAP_HEAD_NAME	"-"

/* Maximum length of an image id (format 2 images) */
#define RBD_IMAGE_ID_LEN_MAX	64

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Mappings are writable unless overridden with the "read_only"/"ro" option */
#define RBD_READ_ONLY_DEFAULT		false
59c2be1e 81
602adf40
YS
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix of the names of data objects */
	u64 features;		/* feature bits (always 0 for v1 images) */
	__u8 obj_order;		/* objects are (1 << obj_order) bytes */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot ids, newest first */
	char *snap_names;	/* consecutive NUL-terminated snapshot names */
	u64 *snap_sizes;	/* image size at each snapshot */

	u64 obj_version;	/* header object version last read from osd */
};
101
/* User-supplied mapping options, parsed by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;	/* map the device read-only */
};
105
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* connection to the ceph cluster */
	struct kref		kref;		/* shared by all devices using it */
	struct list_head	node;		/* entry in rbd_client_list */
};
114
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* bytes transferred */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* number of sub-requests */
	int			num_done;	/* completed so far, in order */
	struct kref		kref;		/* one ref per outstanding sub-request */
	struct rbd_req_status	status[0];	/* one slot per sub-request */
};
133
f0f8cef5
AE
/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;		/* request length, in bytes */
	int			coll_index;	/* slot in coll->status[] */
	struct rbd_req_coll	*coll;		/* owning collection (may be NULL) */
};
145
dfc5606d
YS
/* In-memory record of one image snapshot; also exposed as a sysfs device */
struct rbd_snap {
	struct device		dev;		/* sysfs device under the image */
	const char		*name;		/* snapshot name */
	u64			size;		/* image size when snapshot was taken */
	struct list_head	node;		/* entry in rbd_dev->snaps */
	u64			id;		/* snapshot id */
	u64			features;	/* feature bits at snapshot time */
};
154
f84344f3
AE
/* State of the image (head or one snapshot) as it is currently mapped */
struct rbd_mapping {
	char                    *snap_name;	/* mapped snapshot name, or "-" */
	u64                     snap_id;	/* CEPH_NOSNAP when mapping the head */
	u64                     size;		/* size of what is mapped, in bytes */
	u64                     features;	/* feature bits of what is mapped */
	bool                    snap_exists;	/* cleared if mapped snap is deleted */
	bool                    read_only;	/* snapshots are always read-only */
};
163
602adf40
YS
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_options	rbd_opts;	/* options given at map time */
	struct rbd_client	*rbd_client;	/* shared cluster connection */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;		/* image metadata */
	char			*image_id;	/* format 2 image id */
	size_t			image_id_len;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;	/* name of the header object */
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;	/* header watch registration */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;	/* what is currently mapped */

	struct list_head	node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
206
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for snapshot maintenance (defined later in file) */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* Writing /sys/bus/rbd/add maps an image; .../remove unmaps one */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* No-op release: rbd_root_dev is static and never freed */
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
245
aafb230e
AE
#ifdef RBD_DEBUG
/*
 * Verify an invariant; report and BUG() if it does not hold.
 *
 * Wrapped in do { } while (0) so that a bare "rbd_assert(x);" acts as
 * a single statement — the previous bare-if form mis-parsed inside an
 * unbraced if/else (dangling-else hazard).
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 258
dfc5606d
YS
259static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
260{
261 return get_device(&rbd_dev->dev);
262}
263
264static void rbd_put_dev(struct rbd_device *rbd_dev)
265{
266 put_device(&rbd_dev->dev);
267}
602adf40 268
1fe5e993 269static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 270
602adf40
YS
271static int rbd_open(struct block_device *bdev, fmode_t mode)
272{
f0f8cef5 273 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 274
f84344f3 275 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
276 return -EROFS;
277
340c7a2b 278 rbd_get_dev(rbd_dev);
f84344f3 279 set_device_ro(bdev, rbd_dev->mapping.read_only);
340c7a2b 280
602adf40
YS
281 return 0;
282}
283
dfc5606d
YS
284static int rbd_release(struct gendisk *disk, fmode_t mode)
285{
286 struct rbd_device *rbd_dev = disk->private_data;
287
288 rbd_put_dev(rbd_dev);
289
290 return 0;
291}
292
602adf40
YS
/* Block device operations: rbd devices only implement open/release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
298
299/*
300 * Initialize an rbd client instance.
43ae4701 301 * We own *ceph_opts.
602adf40 302 */
f8c38929 303static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
304{
305 struct rbd_client *rbdc;
306 int ret = -ENOMEM;
307
308 dout("rbd_client_create\n");
309 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
310 if (!rbdc)
311 goto out_opt;
312
313 kref_init(&rbdc->kref);
314 INIT_LIST_HEAD(&rbdc->node);
315
bc534d86
AE
316 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
317
43ae4701 318 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 319 if (IS_ERR(rbdc->client))
bc534d86 320 goto out_mutex;
43ae4701 321 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
322
323 ret = ceph_open_session(rbdc->client);
324 if (ret < 0)
325 goto out_err;
326
432b8587 327 spin_lock(&rbd_client_list_lock);
602adf40 328 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 329 spin_unlock(&rbd_client_list_lock);
602adf40 330
bc534d86
AE
331 mutex_unlock(&ctl_mutex);
332
602adf40
YS
333 dout("rbd_client_create created %p\n", rbdc);
334 return rbdc;
335
336out_err:
337 ceph_destroy_client(rbdc->client);
bc534d86
AE
338out_mutex:
339 mutex_unlock(&ctl_mutex);
602adf40
YS
340 kfree(rbdc);
341out_opt:
43ae4701
AE
342 if (ceph_opts)
343 ceph_destroy_options(ceph_opts);
28f259b7 344 return ERR_PTR(ret);
602adf40
YS
345}
346
347/*
1f7ba331
AE
348 * Find a ceph client with specific addr and configuration. If
349 * found, bump its reference count.
602adf40 350 */
1f7ba331 351static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
352{
353 struct rbd_client *client_node;
1f7ba331 354 bool found = false;
602adf40 355
43ae4701 356 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
357 return NULL;
358
1f7ba331
AE
359 spin_lock(&rbd_client_list_lock);
360 list_for_each_entry(client_node, &rbd_client_list, node) {
361 if (!ceph_compare_options(ceph_opts, client_node->client)) {
362 kref_get(&client_node->kref);
363 found = true;
364 break;
365 }
366 }
367 spin_unlock(&rbd_client_list_lock);
368
369 return found ? client_node : NULL;
602adf40
YS
370}
371
59c2be1e
YS
/*
 * mount options
 *
 * Token values for match_token().  The Opt_last_* markers delimit
 * ranges that tell parse_rbd_opts_token() what kind of argument a
 * token carries (int, string, or none/Boolean).
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
385
43ae4701 386static match_table_t rbd_opts_tokens = {
59c2be1e
YS
387 /* int args above */
388 /* string args above */
f84344f3 389 {Opt_read_only, "mapping.read_only"},
cc0538b6
AE
390 {Opt_read_only, "ro"}, /* Alternate spelling */
391 {Opt_read_write, "read_write"},
392 {Opt_read_write, "rw"}, /* Alternate spelling */
393 /* Boolean args above */
59c2be1e
YS
394 {-1, NULL}
395};
396
/*
 * Callback for ceph_parse_options(): handle one rbd-specific option.
 *
 * @c:       the option text (e.g. "ro")
 * @private: the struct rbd_options being filled in
 *
 * Returns 0 on success, -EINVAL for an unrecognized option, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* The Opt_last_* ranges determine the argument kind */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned a token we don't handle */
		rbd_assert(false);
		break;
	}
	return 0;
}
437
602adf40
YS
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Parses "options" (filling rbd_dev->rbd_opts as a side effect), then
 * either reuses a matching existing client or creates a new one.  The
 * parsed ceph_opts are consumed in both cases.  On success the client
 * is stored in rbd_dev->rbd_client.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; find() took a reference */
		ceph_destroy_options(ceph_opts);
	} else {
		/* create() owns ceph_opts from here on */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
470
/*
 * Destroy ceph client
 *
 * Called via kref_put() from rbd_put_client().  Takes
 * rbd_client_list_lock itself to unlink the client, so callers must
 * NOT hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
488
489/*
490 * Drop reference to ceph client node. If it's not referenced anymore, release
491 * it.
492 */
493static void rbd_put_client(struct rbd_device *rbd_dev)
494{
495 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
496 rbd_dev->rbd_client = NULL;
602adf40
YS
497}
498
1fec7093
YS
499/*
500 * Destroy requests collection
501 */
502static void rbd_coll_release(struct kref *kref)
503{
504 struct rbd_req_coll *coll =
505 container_of(kref, struct rbd_req_coll, kref);
506
507 dout("rbd_coll_release %p\n", coll);
508 kfree(coll);
509}
602adf40 510
a30b71b9
AE
511static bool rbd_image_format_valid(u32 image_format)
512{
513 return image_format == 1 || image_format == 2;
514}
515
8e94af8e
AE
/*
 * Sanity-check an on-disk (v1) image header before trusting its
 * counts and lengths.  Returns false if the header is malformed.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
544
602adf40
YS
545/*
546 * Create a new header structure, translate header format from the on-disk
547 * header.
548 */
549static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 550 struct rbd_image_header_ondisk *ondisk)
602adf40 551{
ccece235 552 u32 snap_count;
58c17b0e 553 size_t len;
d2bb24e5 554 size_t size;
621901d6 555 u32 i;
602adf40 556
6a52325f
AE
557 memset(header, 0, sizeof (*header));
558
103a150f
AE
559 snap_count = le32_to_cpu(ondisk->snap_count);
560
58c17b0e
AE
561 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
562 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 563 if (!header->object_prefix)
602adf40 564 return -ENOMEM;
58c17b0e
AE
565 memcpy(header->object_prefix, ondisk->object_prefix, len);
566 header->object_prefix[len] = '\0';
00f1f36f 567
602adf40 568 if (snap_count) {
f785cc1d
AE
569 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
570
621901d6
AE
571 /* Save a copy of the snapshot names */
572
f785cc1d
AE
573 if (snap_names_len > (u64) SIZE_MAX)
574 return -EIO;
575 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 576 if (!header->snap_names)
6a52325f 577 goto out_err;
f785cc1d
AE
578 /*
579 * Note that rbd_dev_v1_header_read() guarantees
580 * the ondisk buffer we're working with has
581 * snap_names_len bytes beyond the end of the
582 * snapshot id array, this memcpy() is safe.
583 */
584 memcpy(header->snap_names, &ondisk->snaps[snap_count],
585 snap_names_len);
6a52325f 586
621901d6
AE
587 /* Record each snapshot's size */
588
d2bb24e5
AE
589 size = snap_count * sizeof (*header->snap_sizes);
590 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 591 if (!header->snap_sizes)
6a52325f 592 goto out_err;
621901d6
AE
593 for (i = 0; i < snap_count; i++)
594 header->snap_sizes[i] =
595 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 596 } else {
ccece235 597 WARN_ON(ondisk->snap_names_len);
602adf40
YS
598 header->snap_names = NULL;
599 header->snap_sizes = NULL;
600 }
849b4260 601
34b13184 602 header->features = 0; /* No features support in v1 images */
602adf40
YS
603 header->obj_order = ondisk->options.order;
604 header->crypt_type = ondisk->options.crypt_type;
605 header->comp_type = ondisk->options.comp_type;
6a52325f 606
621901d6
AE
607 /* Allocate and fill in the snapshot context */
608
f84344f3 609 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
610 size = sizeof (struct ceph_snap_context);
611 size += snap_count * sizeof (header->snapc->snaps[0]);
612 header->snapc = kzalloc(size, GFP_KERNEL);
613 if (!header->snapc)
614 goto out_err;
602adf40
YS
615
616 atomic_set(&header->snapc->nref, 1);
505cbb9b 617 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 618 header->snapc->num_snaps = snap_count;
621901d6
AE
619 for (i = 0; i < snap_count; i++)
620 header->snapc->snaps[i] =
621 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
622
623 return 0;
624
6a52325f 625out_err:
849b4260 626 kfree(header->snap_sizes);
ccece235 627 header->snap_sizes = NULL;
602adf40 628 kfree(header->snap_names);
ccece235 629 header->snap_names = NULL;
6a52325f
AE
630 kfree(header->object_prefix);
631 header->object_prefix = NULL;
ccece235 632
00f1f36f 633 return -ENOMEM;
602adf40
YS
634}
635
8836b995 636static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 637{
602adf40 638
e86924a8 639 struct rbd_snap *snap;
602adf40 640
e86924a8
AE
641 list_for_each_entry(snap, &rbd_dev->snaps, node) {
642 if (!strcmp(snap_name, snap->name)) {
643 rbd_dev->mapping.snap_id = snap->id;
644 rbd_dev->mapping.size = snap->size;
34b13184 645 rbd_dev->mapping.features = snap->features;
602adf40 646
e86924a8 647 return 0;
00f1f36f 648 }
00f1f36f 649 }
e86924a8 650
00f1f36f 651 return -ENOENT;
602adf40
YS
652}
653
5ed16177 654static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
602adf40 655{
78dc447d 656 int ret;
602adf40 657
4e1105a2 658 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 659 sizeof (RBD_SNAP_HEAD_NAME))) {
f84344f3 660 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
99c1f08f 661 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 662 rbd_dev->mapping.features = rbd_dev->header.features;
f84344f3
AE
663 rbd_dev->mapping.snap_exists = false;
664 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
e86924a8 665 ret = 0;
602adf40 666 } else {
8836b995 667 ret = snap_by_name(rbd_dev, snap_name);
602adf40
YS
668 if (ret < 0)
669 goto done;
f84344f3
AE
670 rbd_dev->mapping.snap_exists = true;
671 rbd_dev->mapping.read_only = true;
602adf40 672 }
4e1105a2 673 rbd_dev->mapping.snap_name = snap_name;
602adf40 674done:
602adf40
YS
675 return ret;
676}
677
678static void rbd_header_free(struct rbd_image_header *header)
679{
849b4260 680 kfree(header->object_prefix);
d78fd7ae 681 header->object_prefix = NULL;
602adf40 682 kfree(header->snap_sizes);
d78fd7ae 683 header->snap_sizes = NULL;
849b4260 684 kfree(header->snap_names);
d78fd7ae 685 header->snap_names = NULL;
d1d25646 686 ceph_put_snap_context(header->snapc);
d78fd7ae 687 header->snapc = NULL;
602adf40
YS
688}
689
65ccfe21 690static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 691{
65ccfe21
AE
692 char *name;
693 u64 segment;
694 int ret;
602adf40 695
65ccfe21
AE
696 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
697 if (!name)
698 return NULL;
699 segment = offset >> rbd_dev->header.obj_order;
700 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
701 rbd_dev->header.object_prefix, segment);
702 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
703 pr_err("error formatting segment name for #%llu (%d)\n",
704 segment, ret);
705 kfree(name);
706 name = NULL;
707 }
602adf40 708
65ccfe21
AE
709 return name;
710}
602adf40 711
65ccfe21
AE
712static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
713{
714 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 715
65ccfe21
AE
716 return offset & (segment_size - 1);
717}
718
719static u64 rbd_segment_length(struct rbd_device *rbd_dev,
720 u64 offset, u64 length)
721{
722 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
723
724 offset &= segment_size - 1;
725
aafb230e 726 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
727 if (offset + length > segment_size)
728 length = segment_size - offset;
729
730 return length;
602adf40
YS
731}
732
1fec7093
YS
/*
 * Number of segments touched by the byte range [ofs, ofs + len).
 * Returns 0 for an empty range, -ERANGE if the range wraps u64.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	/* NOTE(review): the u64 count is truncated to int here; a range
	 * spanning more than INT_MAX segments would overflow — confirm
	 * callers bound len (e.g. per-request sizes). */
	return end_seg - start_seg + 1;
}
749
029bcbd8
JD
750/*
751 * returns the size of an object in the image
752 */
753static u64 rbd_obj_bytes(struct rbd_image_header *header)
754{
755 return 1 << header->obj_order;
756}
757
602adf40
YS
758/*
759 * bio helpers
760 */
761
762static void bio_chain_put(struct bio *chain)
763{
764 struct bio *tmp;
765
766 while (chain) {
767 tmp = chain;
768 chain = chain->bi_next;
769 bio_put(tmp);
770 }
771}
772
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain; once the running
 * position passes start_ofs, the remainder of each segment is
 * zero-filled (mapped with bvec_kmap_irq() since segments may live in
 * high memory).
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero only the part past start_ofs */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
799
800/*
801 * bio_chain_clone - clone a chain of bios up to a certain length.
802 * might return a bio_pair that will need to be released.
803 */
804static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
805 struct bio_pair **bp,
806 int len, gfp_t gfpmask)
807{
542582fc
AE
808 struct bio *old_chain = *old;
809 struct bio *new_chain = NULL;
810 struct bio *tail;
602adf40
YS
811 int total = 0;
812
813 if (*bp) {
814 bio_pair_release(*bp);
815 *bp = NULL;
816 }
817
818 while (old_chain && (total < len)) {
542582fc
AE
819 struct bio *tmp;
820
602adf40
YS
821 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
822 if (!tmp)
823 goto err_out;
542582fc 824 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
602adf40
YS
825
826 if (total + old_chain->bi_size > len) {
827 struct bio_pair *bp;
828
829 /*
830 * this split can only happen with a single paged bio,
831 * split_bio will BUG_ON if this is not the case
832 */
833 dout("bio_chain_clone split! total=%d remaining=%d"
bd919d45
AE
834 "bi_size=%u\n",
835 total, len - total, old_chain->bi_size);
602adf40
YS
836
837 /* split the bio. We'll release it either in the next
838 call, or it will have to be released outside */
593a9e7b 839 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
602adf40
YS
840 if (!bp)
841 goto err_out;
842
843 __bio_clone(tmp, &bp->bio1);
844
845 *next = &bp->bio2;
846 } else {
847 __bio_clone(tmp, old_chain);
848 *next = old_chain->bi_next;
849 }
850
851 tmp->bi_bdev = NULL;
602adf40 852 tmp->bi_next = NULL;
542582fc 853 if (new_chain)
602adf40 854 tail->bi_next = tmp;
542582fc
AE
855 else
856 new_chain = tmp;
857 tail = tmp;
602adf40
YS
858 old_chain = old_chain->bi_next;
859
860 total += tmp->bi_size;
861 }
862
aafb230e 863 rbd_assert(total == len);
602adf40 864
602adf40
YS
865 *old = old_chain;
866
867 return new_chain;
868
869err_out:
870 dout("bio_chain_clone with err\n");
871 bio_chain_put(new_chain);
872 return NULL;
873}
874
/*
 * helpers for osd request op vectors.
 *
 * Allocate a zeroed vector of num_ops ops (plus a zero terminator)
 * with the first op's opcode and payload length filled in.  Returns
 * NULL on allocation failure; free with rbd_destroy_ops().
 */
static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
					int opcode, u32 payload_len)
{
	struct ceph_osd_req_op *ops;

	ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
	if (!ops)
		return NULL;

	ops[0].op = opcode;

	/*
	 * op extent offset and length will be set later on
	 * in calc_raw_layout()
	 */
	ops[0].payload_len = payload_len;

	return ops;
}
897
/* Free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
902
1fec7093
YS
/*
 * Record completion of sub-request "index" of a collection and push
 * completion of the block-layer request forward, in order.  Completed
 * slots are only reported to the block layer once every earlier slot
 * has completed too.  With no collection the whole request completes
 * at once.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock protects the collection's status array */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* extend max over the contiguous run of completed slots */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
940
/* Complete the collection slot belonging to a single rbd_request */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
946
602adf40
YS
/*
 * Send ceph osd request
 *
 * Builds and submits one osd request against "object_name".  If
 * rbd_cb is NULL the call is synchronous: it waits for the reply,
 * optionally returns the object version via *ver, and drops the
 * request.  With a callback the request completes asynchronously and
 * the callback owns the cleanup.  If linger_req is non-NULL the
 * request is registered to linger (used for header watches) and
 * returned to the caller.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still report this slot done so the collection drains */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
		(unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/* NOTE(review): strncpy() does not NUL-terminate if object_name
	 * fills r_oid exactly; the strlen() below assumes it did —
	 * confirm object names are always shorter than r_oid. */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* One object per "file": stripe unit == object size */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and clean up ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1058
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous data requests.  Nonexistent
 * objects read as zeroes, and short reads are zero-padded to the
 * requested length before the request is completed.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* object doesn't exist: reads succeed and return zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero-fill the rest of the buffer */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1098
59c2be1e
YS
/* Minimal completion callback: just drop the request reference */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1103
602adf40
YS
1104/*
1105 * Do a synchronous ceph osd operation
1106 */
0ce1a794 1107static int rbd_req_sync_op(struct rbd_device *rbd_dev,
602adf40
YS
1108 struct ceph_snap_context *snapc,
1109 u64 snapid,
602adf40 1110 int flags,
913d2fdc 1111 struct ceph_osd_req_op *ops,
aded07ea 1112 const char *object_name,
f8d4de6e
AE
1113 u64 ofs, u64 inbound_size,
1114 char *inbound,
59c2be1e
YS
1115 struct ceph_osd_request **linger_req,
1116 u64 *ver)
602adf40
YS
1117{
1118 int ret;
1119 struct page **pages;
1120 int num_pages;
913d2fdc 1121
aafb230e 1122 rbd_assert(ops != NULL);
602adf40 1123
f8d4de6e 1124 num_pages = calc_pages_for(ofs, inbound_size);
602adf40 1125 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
b8d0638a
DC
1126 if (IS_ERR(pages))
1127 return PTR_ERR(pages);
602adf40 1128
0ce1a794 1129 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
f8d4de6e 1130 object_name, ofs, inbound_size, NULL,
602adf40
YS
1131 pages, num_pages,
1132 flags,
1133 ops,
1fec7093 1134 NULL, 0,
59c2be1e
YS
1135 NULL,
1136 linger_req, ver);
602adf40 1137 if (ret < 0)
913d2fdc 1138 goto done;
602adf40 1139
f8d4de6e
AE
1140 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1141 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
602adf40 1142
602adf40
YS
1143done:
1144 ceph_release_page_vector(pages, num_pages);
1145 return ret;
1146}
1147
1148/*
1149 * Do an asynchronous ceph osd operation
1150 */
1151static int rbd_do_op(struct request *rq,
0ce1a794 1152 struct rbd_device *rbd_dev,
602adf40
YS
1153 struct ceph_snap_context *snapc,
1154 u64 snapid,
d1f57ea6 1155 int opcode, int flags,
602adf40 1156 u64 ofs, u64 len,
1fec7093
YS
1157 struct bio *bio,
1158 struct rbd_req_coll *coll,
1159 int coll_index)
602adf40
YS
1160{
1161 char *seg_name;
1162 u64 seg_ofs;
1163 u64 seg_len;
1164 int ret;
1165 struct ceph_osd_req_op *ops;
1166 u32 payload_len;
1167
65ccfe21 1168 seg_name = rbd_segment_name(rbd_dev, ofs);
602adf40
YS
1169 if (!seg_name)
1170 return -ENOMEM;
65ccfe21
AE
1171 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1172 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
602adf40
YS
1173
1174 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1175
57cfc106
AE
1176 ret = -ENOMEM;
1177 ops = rbd_create_rw_ops(1, opcode, payload_len);
1178 if (!ops)
602adf40
YS
1179 goto done;
1180
1181 /* we've taken care of segment sizes earlier when we
1182 cloned the bios. We should never have a segment
1183 truncated at this point */
aafb230e 1184 rbd_assert(seg_len == len);
602adf40
YS
1185
1186 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1187 seg_name, seg_ofs, seg_len,
1188 bio,
1189 NULL, 0,
1190 flags,
1191 ops,
1fec7093 1192 coll, coll_index,
59c2be1e 1193 rbd_req_cb, 0, NULL);
11f77002
SW
1194
1195 rbd_destroy_ops(ops);
602adf40
YS
1196done:
1197 kfree(seg_name);
1198 return ret;
1199}
1200
1201/*
1202 * Request async osd write
1203 */
1204static int rbd_req_write(struct request *rq,
1205 struct rbd_device *rbd_dev,
1206 struct ceph_snap_context *snapc,
1207 u64 ofs, u64 len,
1fec7093
YS
1208 struct bio *bio,
1209 struct rbd_req_coll *coll,
1210 int coll_index)
602adf40
YS
1211{
1212 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1213 CEPH_OSD_OP_WRITE,
1214 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1fec7093 1215 ofs, len, bio, coll, coll_index);
602adf40
YS
1216}
1217
1218/*
1219 * Request async osd read
1220 */
1221static int rbd_req_read(struct request *rq,
1222 struct rbd_device *rbd_dev,
1223 u64 snapid,
1224 u64 ofs, u64 len,
1fec7093
YS
1225 struct bio *bio,
1226 struct rbd_req_coll *coll,
1227 int coll_index)
602adf40
YS
1228{
1229 return rbd_do_op(rq, rbd_dev, NULL,
b06e6a6b 1230 snapid,
602adf40
YS
1231 CEPH_OSD_OP_READ,
1232 CEPH_OSD_FLAG_READ,
1fec7093 1233 ofs, len, bio, coll, coll_index);
602adf40
YS
1234}
1235
1236/*
1237 * Request sync osd read
1238 */
0ce1a794 1239static int rbd_req_sync_read(struct rbd_device *rbd_dev,
602adf40 1240 u64 snapid,
aded07ea 1241 const char *object_name,
602adf40 1242 u64 ofs, u64 len,
59c2be1e
YS
1243 char *buf,
1244 u64 *ver)
602adf40 1245{
913d2fdc
AE
1246 struct ceph_osd_req_op *ops;
1247 int ret;
1248
1249 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1250 if (!ops)
1251 return -ENOMEM;
1252
1253 ret = rbd_req_sync_op(rbd_dev, NULL,
b06e6a6b 1254 snapid,
602adf40 1255 CEPH_OSD_FLAG_READ,
913d2fdc
AE
1256 ops, object_name, ofs, len, buf, NULL, ver);
1257 rbd_destroy_ops(ops);
1258
1259 return ret;
602adf40
YS
1260}
1261
1262/*
59c2be1e
YS
1263 * Request sync osd watch
1264 */
0ce1a794 1265static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
59c2be1e 1266 u64 ver,
7f0a24d8 1267 u64 notify_id)
59c2be1e
YS
1268{
1269 struct ceph_osd_req_op *ops;
11f77002
SW
1270 int ret;
1271
57cfc106
AE
1272 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1273 if (!ops)
1274 return -ENOMEM;
59c2be1e 1275
a71b891b 1276 ops[0].watch.ver = cpu_to_le64(ver);
59c2be1e
YS
1277 ops[0].watch.cookie = notify_id;
1278 ops[0].watch.flag = 0;
1279
0ce1a794 1280 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
7f0a24d8 1281 rbd_dev->header_name, 0, 0, NULL,
ad4f232f 1282 NULL, 0,
59c2be1e
YS
1283 CEPH_OSD_FLAG_READ,
1284 ops,
1fec7093 1285 NULL, 0,
59c2be1e
YS
1286 rbd_simple_req_cb, 0, NULL);
1287
1288 rbd_destroy_ops(ops);
1289 return ret;
1290}
1291
1292static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1293{
0ce1a794 1294 struct rbd_device *rbd_dev = (struct rbd_device *)data;
a71b891b 1295 u64 hver;
13143d2d
SW
1296 int rc;
1297
0ce1a794 1298 if (!rbd_dev)
59c2be1e
YS
1299 return;
1300
bd919d45
AE
1301 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1302 rbd_dev->header_name, (unsigned long long) notify_id,
1303 (unsigned int) opcode);
1fe5e993 1304 rc = rbd_refresh_header(rbd_dev, &hver);
13143d2d 1305 if (rc)
f0f8cef5 1306 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
0ce1a794 1307 " update snaps: %d\n", rbd_dev->major, rc);
59c2be1e 1308
7f0a24d8 1309 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
59c2be1e
YS
1310}
1311
1312/*
1313 * Request sync osd watch
1314 */
0e6f322d 1315static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
59c2be1e
YS
1316{
1317 struct ceph_osd_req_op *ops;
0ce1a794 1318 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
57cfc106 1319 int ret;
59c2be1e 1320
57cfc106
AE
1321 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1322 if (!ops)
1323 return -ENOMEM;
59c2be1e
YS
1324
1325 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
0ce1a794 1326 (void *)rbd_dev, &rbd_dev->watch_event);
59c2be1e
YS
1327 if (ret < 0)
1328 goto fail;
1329
0e6f322d 1330 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
0ce1a794 1331 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
59c2be1e
YS
1332 ops[0].watch.flag = 1;
1333
0ce1a794 1334 ret = rbd_req_sync_op(rbd_dev, NULL,
59c2be1e 1335 CEPH_NOSNAP,
59c2be1e
YS
1336 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1337 ops,
0e6f322d
AE
1338 rbd_dev->header_name,
1339 0, 0, NULL,
0ce1a794 1340 &rbd_dev->watch_request, NULL);
59c2be1e
YS
1341
1342 if (ret < 0)
1343 goto fail_event;
1344
1345 rbd_destroy_ops(ops);
1346 return 0;
1347
1348fail_event:
0ce1a794
AE
1349 ceph_osdc_cancel_event(rbd_dev->watch_event);
1350 rbd_dev->watch_event = NULL;
59c2be1e
YS
1351fail:
1352 rbd_destroy_ops(ops);
1353 return ret;
1354}
1355
79e3057c
YS
1356/*
1357 * Request sync osd unwatch
1358 */
070c633f 1359static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
79e3057c
YS
1360{
1361 struct ceph_osd_req_op *ops;
57cfc106 1362 int ret;
79e3057c 1363
57cfc106
AE
1364 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1365 if (!ops)
1366 return -ENOMEM;
79e3057c
YS
1367
1368 ops[0].watch.ver = 0;
0ce1a794 1369 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
79e3057c
YS
1370 ops[0].watch.flag = 0;
1371
0ce1a794 1372 ret = rbd_req_sync_op(rbd_dev, NULL,
79e3057c 1373 CEPH_NOSNAP,
79e3057c
YS
1374 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1375 ops,
070c633f
AE
1376 rbd_dev->header_name,
1377 0, 0, NULL, NULL, NULL);
1378
79e3057c
YS
1379
1380 rbd_destroy_ops(ops);
0ce1a794
AE
1381 ceph_osdc_cancel_event(rbd_dev->watch_event);
1382 rbd_dev->watch_event = NULL;
79e3057c
YS
1383 return ret;
1384}
1385
602adf40 1386/*
3cb4a687 1387 * Synchronous osd object method call
602adf40 1388 */
0ce1a794 1389static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
aded07ea
AE
1390 const char *object_name,
1391 const char *class_name,
1392 const char *method_name,
3cb4a687
AE
1393 const char *outbound,
1394 size_t outbound_size,
f8d4de6e
AE
1395 char *inbound,
1396 size_t inbound_size,
3cb4a687 1397 int flags,
59c2be1e 1398 u64 *ver)
602adf40
YS
1399{
1400 struct ceph_osd_req_op *ops;
aded07ea
AE
1401 int class_name_len = strlen(class_name);
1402 int method_name_len = strlen(method_name);
3cb4a687 1403 int payload_size;
57cfc106
AE
1404 int ret;
1405
3cb4a687
AE
1406 /*
1407 * Any input parameters required by the method we're calling
1408 * will be sent along with the class and method names as
1409 * part of the message payload. That data and its size are
1410 * supplied via the indata and indata_len fields (named from
1411 * the perspective of the server side) in the OSD request
1412 * operation.
1413 */
1414 payload_size = class_name_len + method_name_len + outbound_size;
1415 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
57cfc106
AE
1416 if (!ops)
1417 return -ENOMEM;
602adf40 1418
aded07ea
AE
1419 ops[0].cls.class_name = class_name;
1420 ops[0].cls.class_len = (__u8) class_name_len;
1421 ops[0].cls.method_name = method_name;
1422 ops[0].cls.method_len = (__u8) method_name_len;
602adf40 1423 ops[0].cls.argc = 0;
3cb4a687
AE
1424 ops[0].cls.indata = outbound;
1425 ops[0].cls.indata_len = outbound_size;
602adf40 1426
0ce1a794 1427 ret = rbd_req_sync_op(rbd_dev, NULL,
602adf40 1428 CEPH_NOSNAP,
3cb4a687 1429 flags, ops,
f8d4de6e
AE
1430 object_name, 0, inbound_size, inbound,
1431 NULL, ver);
602adf40
YS
1432
1433 rbd_destroy_ops(ops);
1434
1435 dout("cls_exec returned %d\n", ret);
1436 return ret;
1437}
1438
1fec7093
YS
1439static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1440{
1441 struct rbd_req_coll *coll =
1442 kzalloc(sizeof(struct rbd_req_coll) +
1443 sizeof(struct rbd_req_status) * num_reqs,
1444 GFP_ATOMIC);
1445
1446 if (!coll)
1447 return NULL;
1448 coll->total = num_reqs;
1449 kref_init(&coll->kref);
1450 return coll;
1451}
1452
602adf40
YS
1453/*
1454 * block device queue callback
1455 */
1456static void rbd_rq_fn(struct request_queue *q)
1457{
1458 struct rbd_device *rbd_dev = q->queuedata;
1459 struct request *rq;
1460 struct bio_pair *bp = NULL;
1461
00f1f36f 1462 while ((rq = blk_fetch_request(q))) {
602adf40
YS
1463 struct bio *bio;
1464 struct bio *rq_bio, *next_bio = NULL;
1465 bool do_write;
bd919d45
AE
1466 unsigned int size;
1467 u64 op_size = 0;
602adf40 1468 u64 ofs;
1fec7093
YS
1469 int num_segs, cur_seg = 0;
1470 struct rbd_req_coll *coll;
d1d25646 1471 struct ceph_snap_context *snapc;
602adf40 1472
602adf40
YS
1473 dout("fetched request\n");
1474
1475 /* filter out block requests we don't understand */
1476 if ((rq->cmd_type != REQ_TYPE_FS)) {
1477 __blk_end_request_all(rq, 0);
00f1f36f 1478 continue;
602adf40
YS
1479 }
1480
1481 /* deduce our operation (read, write) */
1482 do_write = (rq_data_dir(rq) == WRITE);
1483
1484 size = blk_rq_bytes(rq);
593a9e7b 1485 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
602adf40 1486 rq_bio = rq->bio;
f84344f3 1487 if (do_write && rbd_dev->mapping.read_only) {
602adf40 1488 __blk_end_request_all(rq, -EROFS);
00f1f36f 1489 continue;
602adf40
YS
1490 }
1491
1492 spin_unlock_irq(q->queue_lock);
1493
d1d25646 1494 down_read(&rbd_dev->header_rwsem);
e88a36ec 1495
f84344f3
AE
1496 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1497 !rbd_dev->mapping.snap_exists) {
e88a36ec 1498 up_read(&rbd_dev->header_rwsem);
d1d25646
JD
1499 dout("request for non-existent snapshot");
1500 spin_lock_irq(q->queue_lock);
1501 __blk_end_request_all(rq, -ENXIO);
1502 continue;
e88a36ec
JD
1503 }
1504
d1d25646
JD
1505 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1506
1507 up_read(&rbd_dev->header_rwsem);
1508
602adf40
YS
1509 dout("%s 0x%x bytes at 0x%llx\n",
1510 do_write ? "write" : "read",
bd919d45 1511 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
602adf40 1512
1fec7093 1513 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
df111be6
AE
1514 if (num_segs <= 0) {
1515 spin_lock_irq(q->queue_lock);
1516 __blk_end_request_all(rq, num_segs);
1517 ceph_put_snap_context(snapc);
1518 continue;
1519 }
1fec7093
YS
1520 coll = rbd_alloc_coll(num_segs);
1521 if (!coll) {
1522 spin_lock_irq(q->queue_lock);
1523 __blk_end_request_all(rq, -ENOMEM);
d1d25646 1524 ceph_put_snap_context(snapc);
00f1f36f 1525 continue;
1fec7093
YS
1526 }
1527
602adf40
YS
1528 do {
1529 /* a bio clone to be passed down to OSD req */
bd919d45 1530 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
65ccfe21 1531 op_size = rbd_segment_length(rbd_dev, ofs, size);
1fec7093 1532 kref_get(&coll->kref);
602adf40
YS
1533 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1534 op_size, GFP_ATOMIC);
1535 if (!bio) {
1fec7093
YS
1536 rbd_coll_end_req_index(rq, coll, cur_seg,
1537 -ENOMEM, op_size);
1538 goto next_seg;
602adf40
YS
1539 }
1540
1fec7093 1541
602adf40
YS
1542 /* init OSD command: write or read */
1543 if (do_write)
1544 rbd_req_write(rq, rbd_dev,
d1d25646 1545 snapc,
602adf40 1546 ofs,
1fec7093
YS
1547 op_size, bio,
1548 coll, cur_seg);
602adf40
YS
1549 else
1550 rbd_req_read(rq, rbd_dev,
f84344f3 1551 rbd_dev->mapping.snap_id,
602adf40 1552 ofs,
1fec7093
YS
1553 op_size, bio,
1554 coll, cur_seg);
602adf40 1555
1fec7093 1556next_seg:
602adf40
YS
1557 size -= op_size;
1558 ofs += op_size;
1559
1fec7093 1560 cur_seg++;
602adf40
YS
1561 rq_bio = next_bio;
1562 } while (size > 0);
1fec7093 1563 kref_put(&coll->kref, rbd_coll_release);
602adf40
YS
1564
1565 if (bp)
1566 bio_pair_release(bp);
602adf40 1567 spin_lock_irq(q->queue_lock);
d1d25646
JD
1568
1569 ceph_put_snap_context(snapc);
602adf40
YS
1570 }
1571}
1572
1573/*
1574 * a queue callback. Makes sure that we don't create a bio that spans across
1575 * multiple osd objects. One exception would be with a single page bios,
1576 * which we handle later at bio_chain_clone
1577 */
1578static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1579 struct bio_vec *bvec)
1580{
1581 struct rbd_device *rbd_dev = q->queuedata;
593a9e7b
AE
1582 unsigned int chunk_sectors;
1583 sector_t sector;
1584 unsigned int bio_sectors;
602adf40
YS
1585 int max;
1586
593a9e7b
AE
1587 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1588 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1589 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1590
602adf40 1591 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
593a9e7b 1592 + bio_sectors)) << SECTOR_SHIFT;
602adf40
YS
1593 if (max < 0)
1594 max = 0; /* bio_add cannot handle a negative return */
1595 if (max <= bvec->bv_len && bio_sectors == 0)
1596 return bvec->bv_len;
1597 return max;
1598}
1599
1600static void rbd_free_disk(struct rbd_device *rbd_dev)
1601{
1602 struct gendisk *disk = rbd_dev->disk;
1603
1604 if (!disk)
1605 return;
1606
602adf40
YS
1607 if (disk->flags & GENHD_FL_UP)
1608 del_gendisk(disk);
1609 if (disk->queue)
1610 blk_cleanup_queue(disk->queue);
1611 put_disk(disk);
1612}
1613
1614/*
4156d998
AE
1615 * Read the complete header for the given rbd device.
1616 *
1617 * Returns a pointer to a dynamically-allocated buffer containing
1618 * the complete and validated header. Caller can pass the address
1619 * of a variable that will be filled in with the version of the
1620 * header object at the time it was read.
1621 *
1622 * Returns a pointer-coded errno if a failure occurs.
602adf40 1623 */
4156d998
AE
1624static struct rbd_image_header_ondisk *
1625rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
602adf40 1626{
4156d998 1627 struct rbd_image_header_ondisk *ondisk = NULL;
50f7c4c9 1628 u32 snap_count = 0;
4156d998
AE
1629 u64 names_size = 0;
1630 u32 want_count;
1631 int ret;
602adf40 1632
00f1f36f 1633 /*
4156d998
AE
1634 * The complete header will include an array of its 64-bit
1635 * snapshot ids, followed by the names of those snapshots as
1636 * a contiguous block of NUL-terminated strings. Note that
1637 * the number of snapshots could change by the time we read
1638 * it in, in which case we re-read it.
00f1f36f 1639 */
4156d998
AE
1640 do {
1641 size_t size;
1642
1643 kfree(ondisk);
1644
1645 size = sizeof (*ondisk);
1646 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1647 size += names_size;
1648 ondisk = kmalloc(size, GFP_KERNEL);
1649 if (!ondisk)
1650 return ERR_PTR(-ENOMEM);
1651
1652 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
0bed54dc 1653 rbd_dev->header_name,
4156d998
AE
1654 0, size,
1655 (char *) ondisk, version);
1656
1657 if (ret < 0)
1658 goto out_err;
1659 if (WARN_ON((size_t) ret < size)) {
1660 ret = -ENXIO;
1661 pr_warning("short header read for image %s"
1662 " (want %zd got %d)\n",
1663 rbd_dev->image_name, size, ret);
1664 goto out_err;
1665 }
1666 if (!rbd_dev_ondisk_valid(ondisk)) {
1667 ret = -ENXIO;
1668 pr_warning("invalid header for image %s\n",
1669 rbd_dev->image_name);
1670 goto out_err;
81e759fb 1671 }
602adf40 1672
4156d998
AE
1673 names_size = le64_to_cpu(ondisk->snap_names_len);
1674 want_count = snap_count;
1675 snap_count = le32_to_cpu(ondisk->snap_count);
1676 } while (snap_count != want_count);
00f1f36f 1677
4156d998 1678 return ondisk;
00f1f36f 1679
4156d998
AE
1680out_err:
1681 kfree(ondisk);
1682
1683 return ERR_PTR(ret);
1684}
1685
1686/*
1687 * reload the ondisk the header
1688 */
1689static int rbd_read_header(struct rbd_device *rbd_dev,
1690 struct rbd_image_header *header)
1691{
1692 struct rbd_image_header_ondisk *ondisk;
1693 u64 ver = 0;
1694 int ret;
602adf40 1695
4156d998
AE
1696 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1697 if (IS_ERR(ondisk))
1698 return PTR_ERR(ondisk);
1699 ret = rbd_header_from_disk(header, ondisk);
1700 if (ret >= 0)
1701 header->obj_version = ver;
1702 kfree(ondisk);
1703
1704 return ret;
602adf40
YS
1705}
1706
dfc5606d
YS
1707static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1708{
1709 struct rbd_snap *snap;
a0593290 1710 struct rbd_snap *next;
dfc5606d 1711
a0593290 1712 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
14e7085d 1713 __rbd_remove_snap_dev(snap);
dfc5606d
YS
1714}
1715
602adf40
YS
1716/*
1717 * only read the first part of the ondisk header, without the snaps info
1718 */
b813623a 1719static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
602adf40
YS
1720{
1721 int ret;
1722 struct rbd_image_header h;
602adf40
YS
1723
1724 ret = rbd_read_header(rbd_dev, &h);
1725 if (ret < 0)
1726 return ret;
1727
a51aa0c0
JD
1728 down_write(&rbd_dev->header_rwsem);
1729
9db4b3e3 1730 /* resized? */
f84344f3 1731 if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
474ef7ce
JD
1732 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1733
99c1f08f
AE
1734 if (size != (sector_t) rbd_dev->mapping.size) {
1735 dout("setting size to %llu sectors",
1736 (unsigned long long) size);
1737 rbd_dev->mapping.size = (u64) size;
1738 set_capacity(rbd_dev->disk, size);
1739 }
474ef7ce 1740 }
9db4b3e3 1741
849b4260 1742 /* rbd_dev->header.object_prefix shouldn't change */
602adf40 1743 kfree(rbd_dev->header.snap_sizes);
849b4260 1744 kfree(rbd_dev->header.snap_names);
d1d25646
JD
1745 /* osd requests may still refer to snapc */
1746 ceph_put_snap_context(rbd_dev->header.snapc);
602adf40 1747
b813623a
AE
1748 if (hver)
1749 *hver = h.obj_version;
a71b891b 1750 rbd_dev->header.obj_version = h.obj_version;
93a24e08 1751 rbd_dev->header.image_size = h.image_size;
602adf40
YS
1752 rbd_dev->header.snapc = h.snapc;
1753 rbd_dev->header.snap_names = h.snap_names;
1754 rbd_dev->header.snap_sizes = h.snap_sizes;
849b4260
AE
1755 /* Free the extra copy of the object prefix */
1756 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1757 kfree(h.object_prefix);
1758
304f6808
AE
1759 ret = rbd_dev_snaps_update(rbd_dev);
1760 if (!ret)
1761 ret = rbd_dev_snaps_register(rbd_dev);
dfc5606d 1762
c666601a 1763 up_write(&rbd_dev->header_rwsem);
602adf40 1764
dfc5606d 1765 return ret;
602adf40
YS
1766}
1767
1fe5e993
AE
1768static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1769{
1770 int ret;
1771
1772 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1773 ret = __rbd_refresh_header(rbd_dev, hver);
1774 mutex_unlock(&ctl_mutex);
1775
1776 return ret;
1777}
1778
602adf40
YS
1779static int rbd_init_disk(struct rbd_device *rbd_dev)
1780{
1781 struct gendisk *disk;
1782 struct request_queue *q;
593a9e7b 1783 u64 segment_size;
602adf40 1784
602adf40 1785 /* create gendisk info */
602adf40
YS
1786 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1787 if (!disk)
1fcdb8aa 1788 return -ENOMEM;
602adf40 1789
f0f8cef5 1790 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
de71a297 1791 rbd_dev->dev_id);
602adf40
YS
1792 disk->major = rbd_dev->major;
1793 disk->first_minor = 0;
1794 disk->fops = &rbd_bd_ops;
1795 disk->private_data = rbd_dev;
1796
1797 /* init rq */
602adf40
YS
1798 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1799 if (!q)
1800 goto out_disk;
029bcbd8 1801
593a9e7b
AE
1802 /* We use the default size, but let's be explicit about it. */
1803 blk_queue_physical_block_size(q, SECTOR_SIZE);
1804
029bcbd8 1805 /* set io sizes to object size */
593a9e7b
AE
1806 segment_size = rbd_obj_bytes(&rbd_dev->header);
1807 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1808 blk_queue_max_segment_size(q, segment_size);
1809 blk_queue_io_min(q, segment_size);
1810 blk_queue_io_opt(q, segment_size);
029bcbd8 1811
602adf40
YS
1812 blk_queue_merge_bvec(q, rbd_merge_bvec);
1813 disk->queue = q;
1814
1815 q->queuedata = rbd_dev;
1816
1817 rbd_dev->disk = disk;
602adf40 1818
12f02944
AE
1819 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1820
602adf40 1821 return 0;
602adf40
YS
1822out_disk:
1823 put_disk(disk);
1fcdb8aa
AE
1824
1825 return -ENOMEM;
602adf40
YS
1826}
1827
dfc5606d
YS
1828/*
1829 sysfs
1830*/
1831
593a9e7b
AE
1832static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1833{
1834 return container_of(dev, struct rbd_device, dev);
1835}
1836
dfc5606d
YS
1837static ssize_t rbd_size_show(struct device *dev,
1838 struct device_attribute *attr, char *buf)
1839{
593a9e7b 1840 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
a51aa0c0
JD
1841 sector_t size;
1842
1843 down_read(&rbd_dev->header_rwsem);
1844 size = get_capacity(rbd_dev->disk);
1845 up_read(&rbd_dev->header_rwsem);
dfc5606d 1846
a51aa0c0 1847 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
dfc5606d
YS
1848}
1849
34b13184
AE
1850/*
1851 * Note this shows the features for whatever's mapped, which is not
1852 * necessarily the base image.
1853 */
1854static ssize_t rbd_features_show(struct device *dev,
1855 struct device_attribute *attr, char *buf)
1856{
1857 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1858
1859 return sprintf(buf, "0x%016llx\n",
1860 (unsigned long long) rbd_dev->mapping.features);
1861}
1862
dfc5606d
YS
1863static ssize_t rbd_major_show(struct device *dev,
1864 struct device_attribute *attr, char *buf)
1865{
593a9e7b 1866 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
602adf40 1867
dfc5606d
YS
1868 return sprintf(buf, "%d\n", rbd_dev->major);
1869}
1870
1871static ssize_t rbd_client_id_show(struct device *dev,
1872 struct device_attribute *attr, char *buf)
602adf40 1873{
593a9e7b 1874 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1875
1dbb4399
AE
1876 return sprintf(buf, "client%lld\n",
1877 ceph_client_id(rbd_dev->rbd_client->client));
602adf40
YS
1878}
1879
dfc5606d
YS
1880static ssize_t rbd_pool_show(struct device *dev,
1881 struct device_attribute *attr, char *buf)
602adf40 1882{
593a9e7b 1883 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d
YS
1884
1885 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1886}
1887
9bb2f334
AE
1888static ssize_t rbd_pool_id_show(struct device *dev,
1889 struct device_attribute *attr, char *buf)
1890{
1891 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1892
1893 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1894}
1895
dfc5606d
YS
1896static ssize_t rbd_name_show(struct device *dev,
1897 struct device_attribute *attr, char *buf)
1898{
593a9e7b 1899 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1900
0bed54dc 1901 return sprintf(buf, "%s\n", rbd_dev->image_name);
dfc5606d
YS
1902}
1903
589d30e0
AE
1904static ssize_t rbd_image_id_show(struct device *dev,
1905 struct device_attribute *attr, char *buf)
1906{
1907 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1908
1909 return sprintf(buf, "%s\n", rbd_dev->image_id);
1910}
1911
34b13184
AE
1912/*
1913 * Shows the name of the currently-mapped snapshot (or
1914 * RBD_SNAP_HEAD_NAME for the base image).
1915 */
dfc5606d
YS
1916static ssize_t rbd_snap_show(struct device *dev,
1917 struct device_attribute *attr,
1918 char *buf)
1919{
593a9e7b 1920 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
dfc5606d 1921
f84344f3 1922 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
dfc5606d
YS
1923}
1924
1925static ssize_t rbd_image_refresh(struct device *dev,
1926 struct device_attribute *attr,
1927 const char *buf,
1928 size_t size)
1929{
593a9e7b 1930 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
b813623a 1931 int ret;
602adf40 1932
1fe5e993 1933 ret = rbd_refresh_header(rbd_dev, NULL);
b813623a
AE
1934
1935 return ret < 0 ? ret : size;
dfc5606d 1936}
602adf40 1937
dfc5606d 1938static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 1939static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
1940static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1941static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1942static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 1943static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 1944static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 1945static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
1946static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1947static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
dfc5606d
YS
1948
1949static struct attribute *rbd_attrs[] = {
1950 &dev_attr_size.attr,
34b13184 1951 &dev_attr_features.attr,
dfc5606d
YS
1952 &dev_attr_major.attr,
1953 &dev_attr_client_id.attr,
1954 &dev_attr_pool.attr,
9bb2f334 1955 &dev_attr_pool_id.attr,
dfc5606d 1956 &dev_attr_name.attr,
589d30e0 1957 &dev_attr_image_id.attr,
dfc5606d
YS
1958 &dev_attr_current_snap.attr,
1959 &dev_attr_refresh.attr,
dfc5606d
YS
1960 NULL
1961};
1962
1963static struct attribute_group rbd_attr_group = {
1964 .attrs = rbd_attrs,
1965};
1966
1967static const struct attribute_group *rbd_attr_groups[] = {
1968 &rbd_attr_group,
1969 NULL
1970};
1971
1972static void rbd_sysfs_dev_release(struct device *dev)
1973{
1974}
1975
1976static struct device_type rbd_device_type = {
1977 .name = "rbd",
1978 .groups = rbd_attr_groups,
1979 .release = rbd_sysfs_dev_release,
1980};
1981
1982
1983/*
1984 sysfs - snapshots
1985*/
1986
1987static ssize_t rbd_snap_size_show(struct device *dev,
1988 struct device_attribute *attr,
1989 char *buf)
1990{
1991 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1992
3591538f 1993 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
dfc5606d
YS
1994}
1995
1996static ssize_t rbd_snap_id_show(struct device *dev,
1997 struct device_attribute *attr,
1998 char *buf)
1999{
2000 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2001
3591538f 2002 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
dfc5606d
YS
2003}
2004
34b13184
AE
2005static ssize_t rbd_snap_features_show(struct device *dev,
2006 struct device_attribute *attr,
2007 char *buf)
2008{
2009 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2010
2011 return sprintf(buf, "0x%016llx\n",
2012 (unsigned long long) snap->features);
2013}
2014
dfc5606d
YS
2015static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2016static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
34b13184 2017static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
dfc5606d
YS
2018
2019static struct attribute *rbd_snap_attrs[] = {
2020 &dev_attr_snap_size.attr,
2021 &dev_attr_snap_id.attr,
34b13184 2022 &dev_attr_snap_features.attr,
dfc5606d
YS
2023 NULL,
2024};
2025
2026static struct attribute_group rbd_snap_attr_group = {
2027 .attrs = rbd_snap_attrs,
2028};
2029
2030static void rbd_snap_dev_release(struct device *dev)
2031{
2032 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2033 kfree(snap->name);
2034 kfree(snap);
2035}
2036
2037static const struct attribute_group *rbd_snap_attr_groups[] = {
2038 &rbd_snap_attr_group,
2039 NULL
2040};
2041
2042static struct device_type rbd_snap_device_type = {
2043 .groups = rbd_snap_attr_groups,
2044 .release = rbd_snap_dev_release,
2045};
2046
304f6808
AE
2047static bool rbd_snap_registered(struct rbd_snap *snap)
2048{
2049 bool ret = snap->dev.type == &rbd_snap_device_type;
2050 bool reg = device_is_registered(&snap->dev);
2051
2052 rbd_assert(!ret ^ reg);
2053
2054 return ret;
2055}
2056
14e7085d 2057static void __rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2058{
2059 list_del(&snap->node);
304f6808
AE
2060 if (device_is_registered(&snap->dev))
2061 device_unregister(&snap->dev);
dfc5606d
YS
2062}
2063
14e7085d 2064static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2065 struct device *parent)
2066{
2067 struct device *dev = &snap->dev;
2068 int ret;
2069
2070 dev->type = &rbd_snap_device_type;
2071 dev->parent = parent;
2072 dev->release = rbd_snap_dev_release;
2073 dev_set_name(dev, "snap_%s", snap->name);
304f6808
AE
2074 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2075
dfc5606d
YS
2076 ret = device_register(dev);
2077
2078 return ret;
2079}
2080
4e891e0a 2081static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2082 const char *snap_name,
34b13184
AE
2083 u64 snap_id, u64 snap_size,
2084 u64 snap_features)
dfc5606d 2085{
4e891e0a 2086 struct rbd_snap *snap;
dfc5606d 2087 int ret;
4e891e0a
AE
2088
2089 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2090 if (!snap)
4e891e0a
AE
2091 return ERR_PTR(-ENOMEM);
2092
2093 ret = -ENOMEM;
c8d18425 2094 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2095 if (!snap->name)
2096 goto err;
2097
c8d18425
AE
2098 snap->id = snap_id;
2099 snap->size = snap_size;
34b13184 2100 snap->features = snap_features;
4e891e0a
AE
2101
2102 return snap;
2103
dfc5606d
YS
2104err:
2105 kfree(snap->name);
2106 kfree(snap);
4e891e0a
AE
2107
2108 return ERR_PTR(ret);
dfc5606d
YS
2109}
2110
cd892126
AE
2111static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2112 u64 *snap_size, u64 *snap_features)
2113{
2114 char *snap_name;
2115
2116 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2117
2118 *snap_size = rbd_dev->header.snap_sizes[which];
2119 *snap_features = 0; /* No features for v1 */
2120
2121 /* Skip over names until we find the one we are looking for */
2122
2123 snap_name = rbd_dev->header.snap_names;
2124 while (which--)
2125 snap_name += strlen(snap_name) + 1;
2126
2127 return snap_name;
2128}
2129
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Issues a synchronous "get_size" class method call against the
 * image's header object.  On success fills in *order and
 * *snap_size and returns 0; otherwise returns the negative errno
 * from the osd request.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Reply layout is defined by the OSD "rbd" class get_size method */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);	/* wire format is LE */

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2163
/*
 * Fetch the current size and object order of the base image
 * (snapshot id CEPH_NOSNAP) into the in-memory header.
 */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2170
dfc5606d 2171/*
35938150
AE
2172 * Scan the rbd device's current snapshot list and compare it to the
2173 * newly-received snapshot context. Remove any existing snapshots
2174 * not present in the new snapshot context. Add a new snapshot for
2175 * any snaphots in the snapshot context not in the current list.
2176 * And verify there are no changes to snapshots we already know
2177 * about.
2178 *
2179 * Assumes the snapshots in the snapshot context are sorted by
2180 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2181 * are also maintained in that order.)
dfc5606d 2182 */
304f6808 2183static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2184{
35938150
AE
2185 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2186 const u32 snap_count = snapc->num_snaps;
35938150
AE
2187 struct list_head *head = &rbd_dev->snaps;
2188 struct list_head *links = head->next;
2189 u32 index = 0;
dfc5606d 2190
9fcbb800 2191 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2192 while (index < snap_count || links != head) {
2193 u64 snap_id;
2194 struct rbd_snap *snap;
cd892126
AE
2195 char *snap_name;
2196 u64 snap_size = 0;
2197 u64 snap_features = 0;
dfc5606d 2198
35938150
AE
2199 snap_id = index < snap_count ? snapc->snaps[index]
2200 : CEPH_NOSNAP;
2201 snap = links != head ? list_entry(links, struct rbd_snap, node)
2202 : NULL;
aafb230e 2203 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2204
35938150
AE
2205 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2206 struct list_head *next = links->next;
dfc5606d 2207
35938150 2208 /* Existing snapshot not in the new snap context */
dfc5606d 2209
f84344f3
AE
2210 if (rbd_dev->mapping.snap_id == snap->id)
2211 rbd_dev->mapping.snap_exists = false;
35938150 2212 __rbd_remove_snap_dev(snap);
9fcbb800 2213 dout("%ssnap id %llu has been removed\n",
f84344f3
AE
2214 rbd_dev->mapping.snap_id == snap->id ?
2215 "mapped " : "",
9fcbb800 2216 (unsigned long long) snap->id);
35938150
AE
2217
2218 /* Done with this list entry; advance */
2219
2220 links = next;
dfc5606d
YS
2221 continue;
2222 }
35938150 2223
cd892126
AE
2224 snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
2225 &snap_size, &snap_features);
2226 if (IS_ERR(snap_name))
2227 return PTR_ERR(snap_name);
2228
9fcbb800
AE
2229 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2230 (unsigned long long) snap_id);
35938150
AE
2231 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2232 struct rbd_snap *new_snap;
2233
2234 /* We haven't seen this snapshot before */
2235
c8d18425 2236 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2237 snap_id, snap_size, snap_features);
9fcbb800
AE
2238 if (IS_ERR(new_snap)) {
2239 int err = PTR_ERR(new_snap);
2240
2241 dout(" failed to add dev, error %d\n", err);
2242
2243 return err;
2244 }
35938150
AE
2245
2246 /* New goes before existing, or at end of list */
2247
9fcbb800 2248 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2249 if (snap)
2250 list_add_tail(&new_snap->node, &snap->node);
2251 else
523f3258 2252 list_add_tail(&new_snap->node, head);
35938150
AE
2253 } else {
2254 /* Already have this one */
2255
9fcbb800
AE
2256 dout(" already present\n");
2257
cd892126 2258 rbd_assert(snap->size == snap_size);
aafb230e 2259 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2260 rbd_assert(snap->features == snap_features);
35938150
AE
2261
2262 /* Done with this list entry; advance */
2263
2264 links = links->next;
dfc5606d 2265 }
35938150
AE
2266
2267 /* Advance to the next entry in the snapshot context */
2268
2269 index++;
dfc5606d 2270 }
9fcbb800 2271 dout("%s: done\n", __func__);
dfc5606d
YS
2272
2273 return 0;
2274}
2275
/*
 * Scan the list of snapshots and register the devices for any that
 * have not already been registered.
 *
 * NOTE(review): the only caller visible here (rbd_add()) holds
 * header_rwsem for write around this call — presumably to keep the
 * snapshot list stable; confirm before adding new callers.
 */
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	int ret = 0;

	dout("%s called\n", __func__);
	/* Parent device must already be registered for children to attach */
	if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
		return -EIO;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!rbd_snap_registered(snap)) {
			ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
			if (ret < 0)
				break;	/* stop at the first failure */
		}
	}
	dout("%s: returning %d\n", __func__, ret);

	return ret;
}
2300
/*
 * Register the rbd device on the rbd bus so it appears in sysfs,
 * named by its numeric device id.  Returns the device_register()
 * result.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;	/* runs when last reference drops */
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
2320
/*
 * Unregister the rbd device registered by rbd_bus_add_dev().
 * Final teardown happens in the rbd_dev_release() callback once
 * the last reference to the device is dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2325
/*
 * Establish a watch on the image header object.  If the watch
 * request fails with -ERANGE — presumably because the cached
 * header is out of date — refresh the header and retry until the
 * watch succeeds or fails with some other error.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_refresh_header(rbd_dev, NULL);
			if (rc < 0)
				return rc;	/* refresh failed; give up */
		}
	} while (ret == -ERANGE);

	return ret;
}
2341
/* Highest device id handed out so far; ids start at 1 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 2358
1ddbe94e 2359/*
499afd5b
AE
2360 * Remove an rbd_dev from the global list, and record that its
2361 * identifier is no longer in use.
1ddbe94e 2362 */
e2839308 2363static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2364{
d184f6bf 2365 struct list_head *tmp;
de71a297 2366 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2367 int max_id;
2368
aafb230e 2369 rbd_assert(rbd_id > 0);
499afd5b 2370
e2839308
AE
2371 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2372 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2373 spin_lock(&rbd_dev_list_lock);
2374 list_del_init(&rbd_dev->node);
d184f6bf
AE
2375
2376 /*
2377 * If the id being "put" is not the current maximum, there
2378 * is nothing special we need to do.
2379 */
e2839308 2380 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2381 spin_unlock(&rbd_dev_list_lock);
2382 return;
2383 }
2384
2385 /*
2386 * We need to update the current maximum id. Search the
2387 * list to find out what it is. We're more likely to find
2388 * the maximum at the end, so search the list backward.
2389 */
2390 max_id = 0;
2391 list_for_each_prev(tmp, &rbd_dev_list) {
2392 struct rbd_device *rbd_dev;
2393
2394 rbd_dev = list_entry(tmp, struct rbd_device, node);
2395 if (rbd_id > max_id)
2396 max_id = rbd_id;
2397 }
499afd5b 2398 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2399
1ddbe94e 2400 /*
e2839308 2401 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2402 * which case it now accurately reflects the new maximum.
2403 * Be careful not to overwrite the maximum value in that
2404 * case.
1ddbe94e 2405 */
e2839308
AE
2406 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2407 dout(" max dev id has been reset\n");
b7f23c36
AE
2408}
2409
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * Characters for which isspace() returns nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	size_t skipped;

	skipped = strspn(*buf, whitespace);	/* leading white space */
	*buf += skipped;

	return strcspn(*buf, whitespace);	/* length of the token */
}
2428
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	/* Same whitespace set next_token() uses ("C"/"POSIX" isspace) */
	const char *spaces = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, spaces);	/* skip to the token start */
	len = strcspn(*buf, spaces);	/* measure the token */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless */

	return len;
}
2458
ea3352f4
AE
2459/*
2460 * Finds the next token in *buf, dynamically allocates a buffer big
2461 * enough to hold a copy of it, and copies the token into the new
2462 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2463 * that a duplicate buffer is created even for a zero-length token.
2464 *
2465 * Returns a pointer to the newly-allocated duplicate, or a null
2466 * pointer if memory for the duplicate was not available. If
2467 * the lenp argument is a non-null pointer, the length of the token
2468 * (not including the '\0') is returned in *lenp.
2469 *
2470 * If successful, the *buf pointer will be updated to point beyond
2471 * the end of the found token.
2472 *
2473 * Note: uses GFP_KERNEL for allocation.
2474 */
2475static inline char *dup_token(const char **buf, size_t *lenp)
2476{
2477 char *dup;
2478 size_t len;
2479
2480 len = next_token(buf);
2481 dup = kmalloc(len + 1, GFP_KERNEL);
2482 if (!dup)
2483 return NULL;
2484
2485 memcpy(dup, *buf, len);
2486 *(dup + len) = '\0';
2487 *buf += len;
2488
2489 if (lenp)
2490 *lenp = len;
2491
2492 return dup;
2493}
2494
/*
 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
 * rbd_md_name, and name fields of the given rbd_dev, based on the
 * list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.  Returns a pointer to a dynamically-allocated
 * copy of the snapshot name to map if successful, or a
 * pointer-coded error otherwise.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);	/* bad input until proven good */
	char *snap_name;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;	/* count includes the terminator */
	*mon_addrs = buf;	/* points into caller's buffer; no copy made */

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)	/* missing, or would not fit */
		return err_ptr;

	err_ptr = ERR_PTR(-ENOMEM);	/* remaining failures are allocations */
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);

	return snap_name;

out_err:
	/* Undo partial parsing so the caller sees all-or-nothing */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2564
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Ask the OSD class method for the id stored in the id object */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Decode the length-prefixed string into a fresh allocation */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;	/* leave field NULL on error */
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2633
/*
 * Finish probing a format 1 image: record an empty image id and
 * the "<image_name>.rbd" header object name, then read the on-disk
 * header into memory.  Returns 0 on success; on failure all fields
 * set here are freed and reset to NULL.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
2676
/*
 * Finish probing a format 2 image: record the header object name
 * (derived from the image id) and fetch the image size and object
 * order.  Format 2 support is not complete at this point, so even
 * on success this deliberately returns -ENOTSUPP after recording
 * what it learned.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return -ENOTSUPP;	/* v2 images cannot be mapped yet */
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;

	return ret;
}
2710
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Only format 2 images have an id object.  If looking it up
	 * fails (typically ENOENT), assume this is a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret == 0)
		ret = rbd_dev_v2_probe(rbd_dev);
	else
		ret = rbd_dev_v1_probe(rbd_dev);

	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
2735
/*
 * Handle a write to /sys/bus/rbd/add: parse the user-supplied
 * "mon_addrs options pool image [snap]" string, connect to the
 * cluster, probe the image, and bring up the block device.
 * Returns count on success or a negative errno.
 *
 * The error labels unwind in reverse order of setup; once
 * rbd_bus_add_dev() has succeeded, cleanup is instead driven by
 * the sysfs release path (err_out_bus).
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	char *options;
	struct rbd_device *rbd_dev = NULL;
	const char *mon_addrs = NULL;
	size_t mon_addrs_size = 0;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;
	char *snap_name;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_out_mem;
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_out_mem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	/* parse add command */
	snap_name = rbd_add_parse_args(rbd_dev, buf,
				&mon_addrs, &mon_addrs_size, options, count);
	if (IS_ERR(snap_name)) {
		rc = PTR_ERR(snap_name);
		goto err_out_mem;
	}
	/*
	 * NOTE(review): snap_name is only stored (and thus later
	 * freed) by rbd_dev_set_mapping(); failures between here and
	 * that call appear to leak it — verify and fix separately.
	 */

	rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
	if (rc < 0)
		goto err_out_args;

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->pool_id = rc;

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_client;
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = rbd_dev_snaps_update(rbd_dev);
	if (rc)
		goto err_out_header;

	rc = rbd_dev_set_mapping(rbd_dev, snap_name);
	if (rc)
		goto err_out_header;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_id;
	rbd_dev->major = rc;	/* 0 asked the kernel to pick a major */

	/* Set up the blkdev mapping. */

	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return count;

err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	return rc;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_header:
	rbd_header_free(&rbd_dev->header);
err_out_client:
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev);
	kfree(rbd_dev->image_id);
err_out_args:
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
err_out_mem:
	kfree(rbd_dev);
	kfree(options);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2878
de71a297 2879static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
2880{
2881 struct list_head *tmp;
2882 struct rbd_device *rbd_dev;
2883
e124a82f 2884 spin_lock(&rbd_dev_list_lock);
602adf40
YS
2885 list_for_each(tmp, &rbd_dev_list) {
2886 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 2887 if (rbd_dev->dev_id == dev_id) {
e124a82f 2888 spin_unlock(&rbd_dev_list_lock);
602adf40 2889 return rbd_dev;
e124a82f 2890 }
602adf40 2891 }
e124a82f 2892 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
2893 return NULL;
2894}
2895
/*
 * Device-model release callback: final teardown of an rbd device
 * once the last reference to its struct device is dropped (set up
 * in rbd_bus_add_dev()).  Tears down the watch, the client, the
 * disk, and all allocated name/header fields, then drops the
 * module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2930
/*
 * Handle a write to /sys/bus/rbd/remove: parse the target device
 * id, look up the device, and unregister it (along with all of its
 * snapshot devices).  Returns count on success, -ENOENT if no such
 * device, or a parse error.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	/* ctl_mutex serializes this against concurrent add/remove */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);	/* triggers rbd_dev_release() teardown */

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
2965
/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	/* Undo the root device registration if bus registration fails */
	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}
2984
/* Tear down the sysfs entries in reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2990
/*
 * Module init: register the rbd bus and root device in sysfs.
 * Actual devices are created later via writes to /sys/bus/rbd/add.
 */
int __init rbd_init(void)
{
	int rc;

	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
	return 0;
}
3001
/* Module exit: remove the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3006
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");