]> git.ipfire.org Git - people/arne_f/kernel.git/blame - drivers/block/rbd.c
rbd: encapsulate code that gets snapshot info
[people/arne_f/kernel.git] / drivers / block / rbd.c
CommitLineData
602adf40
YS
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
dfc5606d 24 For usage instructions, please refer to:
602adf40 25
dfc5606d 26 Documentation/ABI/testing/sysfs-bus-rbd
602adf40
YS
27
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
59c2be1e 34#include <linux/parser.h>
602adf40
YS
35
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
aafb230e
AE
44#define RBD_DEBUG /* Activate rbd_assert() calls */
45
593a9e7b
AE
46/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
df111be6
AE
55/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
f0f8cef5
AE
59#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
602adf40
YS
61
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
602adf40
YS
64#define RBD_MAX_SNAP_NAME_LEN 32
65#define RBD_MAX_OPT_LEN 1024
66
67#define RBD_SNAP_HEAD_NAME "-"
68
589d30e0
AE
69#define RBD_IMAGE_ID_LEN_MAX 64
70
81a89793
AE
71/*
72 * An RBD device name will be "rbd#", where the "rbd" comes from
73 * RBD_DRV_NAME above, and # is a unique integer identifier.
74 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
75 * enough to hold all possible device names.
76 */
602adf40 77#define DEV_NAME_LEN 32
81a89793 78#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
602adf40 79
cc0538b6 80#define RBD_READ_ONLY_DEFAULT false
59c2be1e 81
602adf40
YS
82/*
83 * block device image metadata (in-memory version)
84 */
85struct rbd_image_header {
f84344f3 86 /* These four fields never change for a given rbd image */
849b4260 87 char *object_prefix;
34b13184 88 u64 features;
602adf40
YS
89 __u8 obj_order;
90 __u8 crypt_type;
91 __u8 comp_type;
602adf40 92
f84344f3
AE
93 /* The remaining fields need to be updated occasionally */
94 u64 image_size;
95 struct ceph_snap_context *snapc;
602adf40
YS
96 char *snap_names;
97 u64 *snap_sizes;
59c2be1e
YS
98
99 u64 obj_version;
100};
101
102struct rbd_options {
cc0538b6 103 bool read_only;
602adf40
YS
104};
105
106/*
f0f8cef5 107 * an instance of the client. multiple devices may share an rbd client.
602adf40
YS
108 */
109struct rbd_client {
110 struct ceph_client *client;
111 struct kref kref;
112 struct list_head node;
113};
114
115/*
f0f8cef5 116 * a request completion status
602adf40 117 */
1fec7093
YS
118struct rbd_req_status {
119 int done;
120 int rc;
121 u64 bytes;
122};
123
124/*
125 * a collection of requests
126 */
127struct rbd_req_coll {
128 int total;
129 int num_done;
130 struct kref kref;
131 struct rbd_req_status status[0];
602adf40
YS
132};
133
f0f8cef5
AE
134/*
135 * a single io request
136 */
137struct rbd_request {
138 struct request *rq; /* blk layer request */
139 struct bio *bio; /* cloned bio */
140 struct page **pages; /* list of used pages */
141 u64 len;
142 int coll_index;
143 struct rbd_req_coll *coll;
144};
145
dfc5606d
YS
146struct rbd_snap {
147 struct device dev;
148 const char *name;
3591538f 149 u64 size;
dfc5606d
YS
150 struct list_head node;
151 u64 id;
34b13184 152 u64 features;
dfc5606d
YS
153};
154
f84344f3
AE
155struct rbd_mapping {
156 char *snap_name;
157 u64 snap_id;
99c1f08f 158 u64 size;
34b13184 159 u64 features;
f84344f3
AE
160 bool snap_exists;
161 bool read_only;
162};
163
602adf40
YS
164/*
165 * a single device
166 */
167struct rbd_device {
de71a297 168 int dev_id; /* blkdev unique id */
602adf40
YS
169
170 int major; /* blkdev assigned major */
171 struct gendisk *disk; /* blkdev's gendisk and rq */
602adf40 172
f8c38929 173 struct rbd_options rbd_opts;
602adf40
YS
174 struct rbd_client *rbd_client;
175
176 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
177
178 spinlock_t lock; /* queue lock */
179
180 struct rbd_image_header header;
589d30e0
AE
181 char *image_id;
182 size_t image_id_len;
0bed54dc
AE
183 char *image_name;
184 size_t image_name_len;
185 char *header_name;
d22f76e7 186 char *pool_name;
9bb2f334 187 int pool_id;
602adf40 188
59c2be1e
YS
189 struct ceph_osd_event *watch_event;
190 struct ceph_osd_request *watch_request;
191
c666601a
JD
192 /* protects updating the header */
193 struct rw_semaphore header_rwsem;
f84344f3
AE
194
195 struct rbd_mapping mapping;
602adf40
YS
196
197 struct list_head node;
dfc5606d
YS
198
199 /* list of snapshots */
200 struct list_head snaps;
201
202 /* sysfs related */
203 struct device dev;
204};
205
602adf40 206static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
e124a82f 207
602adf40 208static LIST_HEAD(rbd_dev_list); /* devices */
e124a82f
AE
209static DEFINE_SPINLOCK(rbd_dev_list_lock);
210
432b8587
AE
211static LIST_HEAD(rbd_client_list); /* clients */
212static DEFINE_SPINLOCK(rbd_client_list_lock);
602adf40 213
304f6808
AE
214static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
215static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
216
dfc5606d 217static void rbd_dev_release(struct device *dev);
14e7085d 218static void __rbd_remove_snap_dev(struct rbd_snap *snap);
dfc5606d 219
f0f8cef5
AE
220static ssize_t rbd_add(struct bus_type *bus, const char *buf,
221 size_t count);
222static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
223 size_t count);
224
225static struct bus_attribute rbd_bus_attrs[] = {
226 __ATTR(add, S_IWUSR, NULL, rbd_add),
227 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
228 __ATTR_NULL
229};
230
231static struct bus_type rbd_bus_type = {
232 .name = "rbd",
233 .bus_attrs = rbd_bus_attrs,
234};
235
/* The root device is static, so nothing to free on release. */
static void rbd_root_dev_release(struct device *dev)
{
}
239
240static struct device rbd_root_dev = {
241 .init_name = "rbd",
242 .release = rbd_root_dev_release,
243};
244
aafb230e
AE
#ifdef RBD_DEBUG
/* Loud assertion: log the failed expression and its location, then BUG(). */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
dfc5606d 257
dfc5606d
YS
258static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
259{
260 return get_device(&rbd_dev->dev);
261}
262
263static void rbd_put_dev(struct rbd_device *rbd_dev)
264{
265 put_device(&rbd_dev->dev);
266}
602adf40 267
1fe5e993 268static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
59c2be1e 269
602adf40
YS
270static int rbd_open(struct block_device *bdev, fmode_t mode)
271{
f0f8cef5 272 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
602adf40 273
f84344f3 274 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
602adf40
YS
275 return -EROFS;
276
340c7a2b 277 rbd_get_dev(rbd_dev);
f84344f3 278 set_device_ro(bdev, rbd_dev->mapping.read_only);
340c7a2b 279
602adf40
YS
280 return 0;
281}
282
dfc5606d
YS
283static int rbd_release(struct gendisk *disk, fmode_t mode)
284{
285 struct rbd_device *rbd_dev = disk->private_data;
286
287 rbd_put_dev(rbd_dev);
288
289 return 0;
290}
291
602adf40
YS
292static const struct block_device_operations rbd_bd_ops = {
293 .owner = THIS_MODULE,
294 .open = rbd_open,
dfc5606d 295 .release = rbd_release,
602adf40
YS
296};
297
298/*
299 * Initialize an rbd client instance.
43ae4701 300 * We own *ceph_opts.
602adf40 301 */
f8c38929 302static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
602adf40
YS
303{
304 struct rbd_client *rbdc;
305 int ret = -ENOMEM;
306
307 dout("rbd_client_create\n");
308 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
309 if (!rbdc)
310 goto out_opt;
311
312 kref_init(&rbdc->kref);
313 INIT_LIST_HEAD(&rbdc->node);
314
bc534d86
AE
315 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
316
43ae4701 317 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
602adf40 318 if (IS_ERR(rbdc->client))
bc534d86 319 goto out_mutex;
43ae4701 320 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
602adf40
YS
321
322 ret = ceph_open_session(rbdc->client);
323 if (ret < 0)
324 goto out_err;
325
432b8587 326 spin_lock(&rbd_client_list_lock);
602adf40 327 list_add_tail(&rbdc->node, &rbd_client_list);
432b8587 328 spin_unlock(&rbd_client_list_lock);
602adf40 329
bc534d86
AE
330 mutex_unlock(&ctl_mutex);
331
602adf40
YS
332 dout("rbd_client_create created %p\n", rbdc);
333 return rbdc;
334
335out_err:
336 ceph_destroy_client(rbdc->client);
bc534d86
AE
337out_mutex:
338 mutex_unlock(&ctl_mutex);
602adf40
YS
339 kfree(rbdc);
340out_opt:
43ae4701
AE
341 if (ceph_opts)
342 ceph_destroy_options(ceph_opts);
28f259b7 343 return ERR_PTR(ret);
602adf40
YS
344}
345
346/*
1f7ba331
AE
347 * Find a ceph client with specific addr and configuration. If
348 * found, bump its reference count.
602adf40 349 */
1f7ba331 350static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
602adf40
YS
351{
352 struct rbd_client *client_node;
1f7ba331 353 bool found = false;
602adf40 354
43ae4701 355 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
602adf40
YS
356 return NULL;
357
1f7ba331
AE
358 spin_lock(&rbd_client_list_lock);
359 list_for_each_entry(client_node, &rbd_client_list, node) {
360 if (!ceph_compare_options(ceph_opts, client_node->client)) {
361 kref_get(&client_node->kref);
362 found = true;
363 break;
364 }
365 }
366 spin_unlock(&rbd_client_list_lock);
367
368 return found ? client_node : NULL;
602adf40
YS
369}
370
59c2be1e
YS
/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
384
43ae4701 385static match_table_t rbd_opts_tokens = {
59c2be1e
YS
386 /* int args above */
387 /* string args above */
f84344f3 388 {Opt_read_only, "mapping.read_only"},
cc0538b6
AE
389 {Opt_read_only, "ro"}, /* Alternate spelling */
390 {Opt_read_write, "read_write"},
391 {Opt_read_write, "rw"}, /* Alternate spelling */
392 /* Boolean args above */
59c2be1e
YS
393 {-1, NULL}
394};
395
396static int parse_rbd_opts_token(char *c, void *private)
397{
43ae4701 398 struct rbd_options *rbd_opts = private;
59c2be1e
YS
399 substring_t argstr[MAX_OPT_ARGS];
400 int token, intval, ret;
401
43ae4701 402 token = match_token(c, rbd_opts_tokens, argstr);
59c2be1e
YS
403 if (token < 0)
404 return -EINVAL;
405
406 if (token < Opt_last_int) {
407 ret = match_int(&argstr[0], &intval);
408 if (ret < 0) {
409 pr_err("bad mount option arg (not int) "
410 "at '%s'\n", c);
411 return ret;
412 }
413 dout("got int token %d val %d\n", token, intval);
414 } else if (token > Opt_last_int && token < Opt_last_string) {
415 dout("got string token %d val %s\n", token,
416 argstr[0].from);
cc0538b6
AE
417 } else if (token > Opt_last_string && token < Opt_last_bool) {
418 dout("got Boolean token %d\n", token);
59c2be1e
YS
419 } else {
420 dout("got token %d\n", token);
421 }
422
423 switch (token) {
cc0538b6
AE
424 case Opt_read_only:
425 rbd_opts->read_only = true;
426 break;
427 case Opt_read_write:
428 rbd_opts->read_only = false;
429 break;
59c2be1e 430 default:
aafb230e
AE
431 rbd_assert(false);
432 break;
59c2be1e
YS
433 }
434 return 0;
435}
436
602adf40
YS
437/*
438 * Get a ceph client with specific addr and configuration, if one does
439 * not exist create it.
440 */
f8c38929
AE
441static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
442 size_t mon_addr_len, char *options)
602adf40 443{
f8c38929 444 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
43ae4701 445 struct ceph_options *ceph_opts;
f8c38929 446 struct rbd_client *rbdc;
59c2be1e 447
cc0538b6 448 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
602adf40 449
43ae4701
AE
450 ceph_opts = ceph_parse_options(options, mon_addr,
451 mon_addr + mon_addr_len,
452 parse_rbd_opts_token, rbd_opts);
f8c38929
AE
453 if (IS_ERR(ceph_opts))
454 return PTR_ERR(ceph_opts);
602adf40 455
1f7ba331 456 rbdc = rbd_client_find(ceph_opts);
602adf40 457 if (rbdc) {
602adf40 458 /* using an existing client */
43ae4701 459 ceph_destroy_options(ceph_opts);
f8c38929
AE
460 } else {
461 rbdc = rbd_client_create(ceph_opts);
462 if (IS_ERR(rbdc))
463 return PTR_ERR(rbdc);
602adf40 464 }
f8c38929 465 rbd_dev->rbd_client = rbdc;
602adf40 466
f8c38929 467 return 0;
602adf40
YS
468}
469
470/*
471 * Destroy ceph client
d23a4b3f 472 *
432b8587 473 * Caller must hold rbd_client_list_lock.
602adf40
YS
474 */
475static void rbd_client_release(struct kref *kref)
476{
477 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
478
479 dout("rbd_release_client %p\n", rbdc);
cd9d9f5d 480 spin_lock(&rbd_client_list_lock);
602adf40 481 list_del(&rbdc->node);
cd9d9f5d 482 spin_unlock(&rbd_client_list_lock);
602adf40
YS
483
484 ceph_destroy_client(rbdc->client);
485 kfree(rbdc);
486}
487
488/*
489 * Drop reference to ceph client node. If it's not referenced anymore, release
490 * it.
491 */
492static void rbd_put_client(struct rbd_device *rbd_dev)
493{
494 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
495 rbd_dev->rbd_client = NULL;
602adf40
YS
496}
497
1fec7093
YS
498/*
499 * Destroy requests collection
500 */
501static void rbd_coll_release(struct kref *kref)
502{
503 struct rbd_req_coll *coll =
504 container_of(kref, struct rbd_req_coll, kref);
505
506 dout("rbd_coll_release %p\n", coll);
507 kfree(coll);
508}
602adf40 509
8e94af8e
AE
510static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
511{
103a150f
AE
512 size_t size;
513 u32 snap_count;
514
515 /* The header has to start with the magic rbd header text */
516 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
517 return false;
518
519 /*
520 * The size of a snapshot header has to fit in a size_t, and
521 * that limits the number of snapshots.
522 */
523 snap_count = le32_to_cpu(ondisk->snap_count);
524 size = SIZE_MAX - sizeof (struct ceph_snap_context);
525 if (snap_count > size / sizeof (__le64))
526 return false;
527
528 /*
529 * Not only that, but the size of the entire the snapshot
530 * header must also be representable in a size_t.
531 */
532 size -= snap_count * sizeof (__le64);
533 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
534 return false;
535
536 return true;
8e94af8e
AE
537}
538
602adf40
YS
539/*
540 * Create a new header structure, translate header format from the on-disk
541 * header.
542 */
543static int rbd_header_from_disk(struct rbd_image_header *header,
4156d998 544 struct rbd_image_header_ondisk *ondisk)
602adf40 545{
ccece235 546 u32 snap_count;
58c17b0e 547 size_t len;
d2bb24e5 548 size_t size;
621901d6 549 u32 i;
602adf40 550
6a52325f
AE
551 memset(header, 0, sizeof (*header));
552
103a150f
AE
553 snap_count = le32_to_cpu(ondisk->snap_count);
554
58c17b0e
AE
555 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
556 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
6a52325f 557 if (!header->object_prefix)
602adf40 558 return -ENOMEM;
58c17b0e
AE
559 memcpy(header->object_prefix, ondisk->object_prefix, len);
560 header->object_prefix[len] = '\0';
00f1f36f 561
602adf40 562 if (snap_count) {
f785cc1d
AE
563 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
564
621901d6
AE
565 /* Save a copy of the snapshot names */
566
f785cc1d
AE
567 if (snap_names_len > (u64) SIZE_MAX)
568 return -EIO;
569 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
602adf40 570 if (!header->snap_names)
6a52325f 571 goto out_err;
f785cc1d
AE
572 /*
573 * Note that rbd_dev_v1_header_read() guarantees
574 * the ondisk buffer we're working with has
575 * snap_names_len bytes beyond the end of the
576 * snapshot id array, this memcpy() is safe.
577 */
578 memcpy(header->snap_names, &ondisk->snaps[snap_count],
579 snap_names_len);
6a52325f 580
621901d6
AE
581 /* Record each snapshot's size */
582
d2bb24e5
AE
583 size = snap_count * sizeof (*header->snap_sizes);
584 header->snap_sizes = kmalloc(size, GFP_KERNEL);
602adf40 585 if (!header->snap_sizes)
6a52325f 586 goto out_err;
621901d6
AE
587 for (i = 0; i < snap_count; i++)
588 header->snap_sizes[i] =
589 le64_to_cpu(ondisk->snaps[i].image_size);
602adf40 590 } else {
ccece235 591 WARN_ON(ondisk->snap_names_len);
602adf40
YS
592 header->snap_names = NULL;
593 header->snap_sizes = NULL;
594 }
849b4260 595
34b13184 596 header->features = 0; /* No features support in v1 images */
602adf40
YS
597 header->obj_order = ondisk->options.order;
598 header->crypt_type = ondisk->options.crypt_type;
599 header->comp_type = ondisk->options.comp_type;
6a52325f 600
621901d6
AE
601 /* Allocate and fill in the snapshot context */
602
f84344f3 603 header->image_size = le64_to_cpu(ondisk->image_size);
6a52325f
AE
604 size = sizeof (struct ceph_snap_context);
605 size += snap_count * sizeof (header->snapc->snaps[0]);
606 header->snapc = kzalloc(size, GFP_KERNEL);
607 if (!header->snapc)
608 goto out_err;
602adf40
YS
609
610 atomic_set(&header->snapc->nref, 1);
505cbb9b 611 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
602adf40 612 header->snapc->num_snaps = snap_count;
621901d6
AE
613 for (i = 0; i < snap_count; i++)
614 header->snapc->snaps[i] =
615 le64_to_cpu(ondisk->snaps[i].id);
602adf40
YS
616
617 return 0;
618
6a52325f 619out_err:
849b4260 620 kfree(header->snap_sizes);
ccece235 621 header->snap_sizes = NULL;
602adf40 622 kfree(header->snap_names);
ccece235 623 header->snap_names = NULL;
6a52325f
AE
624 kfree(header->object_prefix);
625 header->object_prefix = NULL;
ccece235 626
00f1f36f 627 return -ENOMEM;
602adf40
YS
628}
629
8836b995 630static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
602adf40 631{
602adf40 632
e86924a8 633 struct rbd_snap *snap;
602adf40 634
e86924a8
AE
635 list_for_each_entry(snap, &rbd_dev->snaps, node) {
636 if (!strcmp(snap_name, snap->name)) {
637 rbd_dev->mapping.snap_id = snap->id;
638 rbd_dev->mapping.size = snap->size;
34b13184 639 rbd_dev->mapping.features = snap->features;
602adf40 640
e86924a8 641 return 0;
00f1f36f 642 }
00f1f36f 643 }
e86924a8 644
00f1f36f 645 return -ENOENT;
602adf40
YS
646}
647
5ed16177 648static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
602adf40 649{
78dc447d 650 int ret;
602adf40 651
4e1105a2 652 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
cc9d734c 653 sizeof (RBD_SNAP_HEAD_NAME))) {
f84344f3 654 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
99c1f08f 655 rbd_dev->mapping.size = rbd_dev->header.image_size;
34b13184 656 rbd_dev->mapping.features = rbd_dev->header.features;
f84344f3
AE
657 rbd_dev->mapping.snap_exists = false;
658 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
e86924a8 659 ret = 0;
602adf40 660 } else {
8836b995 661 ret = snap_by_name(rbd_dev, snap_name);
602adf40
YS
662 if (ret < 0)
663 goto done;
f84344f3
AE
664 rbd_dev->mapping.snap_exists = true;
665 rbd_dev->mapping.read_only = true;
602adf40 666 }
4e1105a2 667 rbd_dev->mapping.snap_name = snap_name;
602adf40 668done:
602adf40
YS
669 return ret;
670}
671
672static void rbd_header_free(struct rbd_image_header *header)
673{
849b4260 674 kfree(header->object_prefix);
d78fd7ae 675 header->object_prefix = NULL;
602adf40 676 kfree(header->snap_sizes);
d78fd7ae 677 header->snap_sizes = NULL;
849b4260 678 kfree(header->snap_names);
d78fd7ae 679 header->snap_names = NULL;
d1d25646 680 ceph_put_snap_context(header->snapc);
d78fd7ae 681 header->snapc = NULL;
602adf40
YS
682}
683
65ccfe21 684static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
602adf40 685{
65ccfe21
AE
686 char *name;
687 u64 segment;
688 int ret;
602adf40 689
65ccfe21
AE
690 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
691 if (!name)
692 return NULL;
693 segment = offset >> rbd_dev->header.obj_order;
694 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
695 rbd_dev->header.object_prefix, segment);
696 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
697 pr_err("error formatting segment name for #%llu (%d)\n",
698 segment, ret);
699 kfree(name);
700 name = NULL;
701 }
602adf40 702
65ccfe21
AE
703 return name;
704}
602adf40 705
65ccfe21
AE
706static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
707{
708 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
602adf40 709
65ccfe21
AE
710 return offset & (segment_size - 1);
711}
712
713static u64 rbd_segment_length(struct rbd_device *rbd_dev,
714 u64 offset, u64 length)
715{
716 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
717
718 offset &= segment_size - 1;
719
aafb230e 720 rbd_assert(length <= U64_MAX - offset);
65ccfe21
AE
721 if (offset + length > segment_size)
722 length = segment_size - offset;
723
724 return length;
602adf40
YS
725}
726
1fec7093
YS
727static int rbd_get_num_segments(struct rbd_image_header *header,
728 u64 ofs, u64 len)
729{
df111be6
AE
730 u64 start_seg;
731 u64 end_seg;
732
733 if (!len)
734 return 0;
735 if (len - 1 > U64_MAX - ofs)
736 return -ERANGE;
737
738 start_seg = ofs >> header->obj_order;
739 end_seg = (ofs + len - 1) >> header->obj_order;
740
1fec7093
YS
741 return end_seg - start_seg + 1;
742}
743
029bcbd8
JD
744/*
745 * returns the size of an object in the image
746 */
747static u64 rbd_obj_bytes(struct rbd_image_header *header)
748{
749 return 1 << header->obj_order;
750}
751
602adf40
YS
752/*
753 * bio helpers
754 */
755
756static void bio_chain_put(struct bio *chain)
757{
758 struct bio *tmp;
759
760 while (chain) {
761 tmp = chain;
762 chain = chain->bi_next;
763 bio_put(tmp);
764 }
765}
766
767/*
768 * zeros a bio chain, starting at specific offset
769 */
770static void zero_bio_chain(struct bio *chain, int start_ofs)
771{
772 struct bio_vec *bv;
773 unsigned long flags;
774 void *buf;
775 int i;
776 int pos = 0;
777
778 while (chain) {
779 bio_for_each_segment(bv, chain, i) {
780 if (pos + bv->bv_len > start_ofs) {
781 int remainder = max(start_ofs - pos, 0);
782 buf = bvec_kmap_irq(bv, &flags);
783 memset(buf + remainder, 0,
784 bv->bv_len - remainder);
85b5aaa6 785 bvec_kunmap_irq(buf, &flags);
602adf40
YS
786 }
787 pos += bv->bv_len;
788 }
789
790 chain = chain->bi_next;
791 }
792}
793
794/*
795 * bio_chain_clone - clone a chain of bios up to a certain length.
796 * might return a bio_pair that will need to be released.
797 */
798static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
799 struct bio_pair **bp,
800 int len, gfp_t gfpmask)
801{
542582fc
AE
802 struct bio *old_chain = *old;
803 struct bio *new_chain = NULL;
804 struct bio *tail;
602adf40
YS
805 int total = 0;
806
807 if (*bp) {
808 bio_pair_release(*bp);
809 *bp = NULL;
810 }
811
812 while (old_chain && (total < len)) {
542582fc
AE
813 struct bio *tmp;
814
602adf40
YS
815 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
816 if (!tmp)
817 goto err_out;
542582fc 818 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
602adf40
YS
819
820 if (total + old_chain->bi_size > len) {
821 struct bio_pair *bp;
822
823 /*
824 * this split can only happen with a single paged bio,
825 * split_bio will BUG_ON if this is not the case
826 */
827 dout("bio_chain_clone split! total=%d remaining=%d"
bd919d45
AE
828 "bi_size=%u\n",
829 total, len - total, old_chain->bi_size);
602adf40
YS
830
831 /* split the bio. We'll release it either in the next
832 call, or it will have to be released outside */
593a9e7b 833 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
602adf40
YS
834 if (!bp)
835 goto err_out;
836
837 __bio_clone(tmp, &bp->bio1);
838
839 *next = &bp->bio2;
840 } else {
841 __bio_clone(tmp, old_chain);
842 *next = old_chain->bi_next;
843 }
844
845 tmp->bi_bdev = NULL;
602adf40 846 tmp->bi_next = NULL;
542582fc 847 if (new_chain)
602adf40 848 tail->bi_next = tmp;
542582fc
AE
849 else
850 new_chain = tmp;
851 tail = tmp;
602adf40
YS
852 old_chain = old_chain->bi_next;
853
854 total += tmp->bi_size;
855 }
856
aafb230e 857 rbd_assert(total == len);
602adf40 858
602adf40
YS
859 *old = old_chain;
860
861 return new_chain;
862
863err_out:
864 dout("bio_chain_clone with err\n");
865 bio_chain_put(new_chain);
866 return NULL;
867}
868
869/*
870 * helpers for osd request op vectors.
871 */
57cfc106
AE
872static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
873 int opcode, u32 payload_len)
602adf40 874{
57cfc106
AE
875 struct ceph_osd_req_op *ops;
876
877 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
878 if (!ops)
879 return NULL;
880
881 ops[0].op = opcode;
882
602adf40
YS
883 /*
884 * op extent offset and length will be set later on
885 * in calc_raw_layout()
886 */
57cfc106
AE
887 ops[0].payload_len = payload_len;
888
889 return ops;
602adf40
YS
890}
891
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
896
1fec7093
YS
897static void rbd_coll_end_req_index(struct request *rq,
898 struct rbd_req_coll *coll,
899 int index,
900 int ret, u64 len)
901{
902 struct request_queue *q;
903 int min, max, i;
904
bd919d45
AE
905 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
906 coll, index, ret, (unsigned long long) len);
1fec7093
YS
907
908 if (!rq)
909 return;
910
911 if (!coll) {
912 blk_end_request(rq, ret, len);
913 return;
914 }
915
916 q = rq->q;
917
918 spin_lock_irq(q->queue_lock);
919 coll->status[index].done = 1;
920 coll->status[index].rc = ret;
921 coll->status[index].bytes = len;
922 max = min = coll->num_done;
923 while (max < coll->total && coll->status[max].done)
924 max++;
925
926 for (i = min; i<max; i++) {
927 __blk_end_request(rq, coll->status[i].rc,
928 coll->status[i].bytes);
929 coll->num_done++;
930 kref_put(&coll->kref, rbd_coll_release);
931 }
932 spin_unlock_irq(q->queue_lock);
933}
934
935static void rbd_coll_end_req(struct rbd_request *req,
936 int ret, u64 len)
937{
938 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
939}
940
602adf40
YS
941/*
942 * Send ceph osd request
943 */
944static int rbd_do_request(struct request *rq,
0ce1a794 945 struct rbd_device *rbd_dev,
602adf40
YS
946 struct ceph_snap_context *snapc,
947 u64 snapid,
aded07ea 948 const char *object_name, u64 ofs, u64 len,
602adf40
YS
949 struct bio *bio,
950 struct page **pages,
951 int num_pages,
952 int flags,
953 struct ceph_osd_req_op *ops,
1fec7093
YS
954 struct rbd_req_coll *coll,
955 int coll_index,
602adf40 956 void (*rbd_cb)(struct ceph_osd_request *req,
59c2be1e
YS
957 struct ceph_msg *msg),
958 struct ceph_osd_request **linger_req,
959 u64 *ver)
602adf40
YS
960{
961 struct ceph_osd_request *req;
962 struct ceph_file_layout *layout;
963 int ret;
964 u64 bno;
965 struct timespec mtime = CURRENT_TIME;
966 struct rbd_request *req_data;
967 struct ceph_osd_request_head *reqhead;
1dbb4399 968 struct ceph_osd_client *osdc;
602adf40 969
602adf40 970 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
1fec7093
YS
971 if (!req_data) {
972 if (coll)
973 rbd_coll_end_req_index(rq, coll, coll_index,
974 -ENOMEM, len);
975 return -ENOMEM;
976 }
977
978 if (coll) {
979 req_data->coll = coll;
980 req_data->coll_index = coll_index;
981 }
602adf40 982
bd919d45
AE
983 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
984 (unsigned long long) ofs, (unsigned long long) len);
602adf40 985
0ce1a794 986 osdc = &rbd_dev->rbd_client->client->osdc;
1dbb4399
AE
987 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
988 false, GFP_NOIO, pages, bio);
4ad12621 989 if (!req) {
4ad12621 990 ret = -ENOMEM;
602adf40
YS
991 goto done_pages;
992 }
993
994 req->r_callback = rbd_cb;
995
996 req_data->rq = rq;
997 req_data->bio = bio;
998 req_data->pages = pages;
999 req_data->len = len;
1000
1001 req->r_priv = req_data;
1002
1003 reqhead = req->r_request->front.iov_base;
1004 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1005
aded07ea 1006 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
602adf40
YS
1007 req->r_oid_len = strlen(req->r_oid);
1008
1009 layout = &req->r_file_layout;
1010 memset(layout, 0, sizeof(*layout));
1011 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1012 layout->fl_stripe_count = cpu_to_le32(1);
1013 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
0ce1a794 1014 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
1dbb4399
AE
1015 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1016 req, ops);
602adf40
YS
1017
1018 ceph_osdc_build_request(req, ofs, &len,
1019 ops,
1020 snapc,
1021 &mtime,
1022 req->r_oid, req->r_oid_len);
602adf40 1023
59c2be1e 1024 if (linger_req) {
1dbb4399 1025 ceph_osdc_set_request_linger(osdc, req);
59c2be1e
YS
1026 *linger_req = req;
1027 }
1028
1dbb4399 1029 ret = ceph_osdc_start_request(osdc, req, false);
602adf40
YS
1030 if (ret < 0)
1031 goto done_err;
1032
1033 if (!rbd_cb) {
1dbb4399 1034 ret = ceph_osdc_wait_request(osdc, req);
59c2be1e
YS
1035 if (ver)
1036 *ver = le64_to_cpu(req->r_reassert_version.version);
bd919d45
AE
1037 dout("reassert_ver=%llu\n",
1038 (unsigned long long)
1039 le64_to_cpu(req->r_reassert_version.version));
602adf40
YS
1040 ceph_osdc_put_request(req);
1041 }
1042 return ret;
1043
1044done_err:
1045 bio_chain_put(req_data->bio);
1046 ceph_osdc_put_request(req);
1047done_pages:
1fec7093 1048 rbd_coll_end_req(req_data, ret, len);
602adf40 1049 kfree(req_data);
602adf40
YS
1050 return ret;
1051}
1052
1053/*
1054 * Ceph osd op callback
1055 */
1056static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1057{
1058 struct rbd_request *req_data = req->r_priv;
1059 struct ceph_osd_reply_head *replyhead;
1060 struct ceph_osd_op *op;
1061 __s32 rc;
1062 u64 bytes;
1063 int read_op;
1064
1065 /* parse reply */
1066 replyhead = msg->front.iov_base;
1067 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1068 op = (void *)(replyhead + 1);
1069 rc = le32_to_cpu(replyhead->result);
1070 bytes = le64_to_cpu(op->extent.length);
895cfcc8 1071 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
602adf40 1072
bd919d45
AE
1073 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1074 (unsigned long long) bytes, read_op, (int) rc);
602adf40
YS
1075
1076 if (rc == -ENOENT && read_op) {
1077 zero_bio_chain(req_data->bio, 0);
1078 rc = 0;
1079 } else if (rc == 0 && read_op && bytes < req_data->len) {
1080 zero_bio_chain(req_data->bio, bytes);
1081 bytes = req_data->len;
1082 }
1083
1fec7093 1084 rbd_coll_end_req(req_data, rc, bytes);
602adf40
YS
1085
1086 if (req_data->bio)
1087 bio_chain_put(req_data->bio);
1088
1089 ceph_osdc_put_request(req);
1090 kfree(req_data);
1091}
1092
59c2be1e
YS
/* Minimal completion callback: just drop the request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1097
602adf40
YS
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector large enough for @inbound_size bytes at
 * offset @ofs, issues the request via rbd_do_request() with no
 * completion callback (i.e. synchronously), and for read operations
 * copies the result out of the pages into @inbound.
 *
 * Returns the number of bytes handled, or a negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int flags,
			   struct ceph_osd_req_op *ops,
			   const char *object_name,
			   u64 ofs, u64 inbound_size,
			   char *inbound,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;

	rbd_assert(ops != NULL);

	num_pages = calc_pages_for(ofs, inbound_size);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, inbound_size, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done;

	/* On a read, ret is the number of bytes actually transferred */
	if ((flags & CEPH_OSD_FLAG_READ) && inbound)
		ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);

done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1141
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image extent (@ofs, @len) onto its backing object and
 * submits a single read or write op for that segment.  The bio chain
 * supplied must already have been split on segment boundaries, so
 * the segment length must equal @len (asserted below).
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = rbd_segment_name(rbd_dev, ofs);
	if (!seg_name)
		return -ENOMEM;
	seg_len = rbd_segment_length(rbd_dev, ofs, len);
	seg_ofs = rbd_segment_offset(rbd_dev, ofs);

	/* Only writes carry a data payload toward the osd */
	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = -ENOMEM;
	ops = rbd_create_rw_ops(1, opcode, payload_len);
	if (!ops)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	rbd_assert(seg_len == len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1194
1195/*
1196 * Request async osd write
1197 */
1198static int rbd_req_write(struct request *rq,
1199 struct rbd_device *rbd_dev,
1200 struct ceph_snap_context *snapc,
1201 u64 ofs, u64 len,
1fec7093
YS
1202 struct bio *bio,
1203 struct rbd_req_coll *coll,
1204 int coll_index)
602adf40
YS
1205{
1206 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1207 CEPH_OSD_OP_WRITE,
1208 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1fec7093 1209 ofs, len, bio, coll, coll_index);
602adf40
YS
1210}
1211
1212/*
1213 * Request async osd read
1214 */
1215static int rbd_req_read(struct request *rq,
1216 struct rbd_device *rbd_dev,
1217 u64 snapid,
1218 u64 ofs, u64 len,
1fec7093
YS
1219 struct bio *bio,
1220 struct rbd_req_coll *coll,
1221 int coll_index)
602adf40
YS
1222{
1223 return rbd_do_op(rq, rbd_dev, NULL,
b06e6a6b 1224 snapid,
602adf40
YS
1225 CEPH_OSD_OP_READ,
1226 CEPH_OSD_FLAG_READ,
1fec7093 1227 ofs, len, bio, coll, coll_index);
602adf40
YS
1228}
1229
/*
 * Request sync osd read
 *
 * Synchronously read @len bytes at @ofs from @object_name into @buf.
 * On success the header object version is returned through @ver.
 */
static int rbd_req_sync_read(struct rbd_device *rbd_dev,
			  u64 snapid,
			  const char *object_name,
			  u64 ofs, u64 len,
			  char *buf,
			  u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
	if (!ops)
		return -ENOMEM;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       snapid,
			       CEPH_OSD_FLAG_READ,
			       ops, object_name, ofs, len, buf, NULL, ver);
	rbd_destroy_ops(ops);

	return ret;
}
1255
/*
 * Request sync osd notify-ack
 *
 * Acknowledge a notification received on the header object watch,
 * identified by @notify_id, using the fire-and-forget completion
 * callback rbd_simple_req_cb().
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1285
/*
 * Watch callback for the rbd header object.  Invoked when the header
 * changes (e.g. snapshot created/removed, image resized): refreshes
 * the in-core header, then acknowledges the notification.
 */
static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;
	u64 hver;
	int rc;

	if (!rbd_dev)
		return;

	dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
		rbd_dev->header_name, (unsigned long long) notify_id,
		(unsigned int) opcode);
	rc = rbd_refresh_header(rbd_dev, &hver);
	if (rc)
		pr_warning(RBD_DRV_NAME "%d got notification but failed to "
			   " update snaps: %d\n", rbd_dev->major, rc);

	/* Ack even on refresh failure so the osd stops re-notifying */
	rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
}
1305
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object so that rbd_watch_cb() is
 * called when it changes.  Creates the osd event first, then issues
 * a lingering WATCH op; on failure the event is torn down again.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1349
79e3057c
YS
/*
 * Request sync osd unwatch
 *
 * Tears down the header-object watch registered by
 * rbd_req_sync_watch() and cancels the associated osd event.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == unregister the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);


	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1379
/*
 * Synchronous osd object method call
 *
 * Invokes @class_name.@method_name on @object_name via a CALL op.
 * Method input goes in @outbound/@outbound_size; any reply data is
 * copied into @inbound (up to @inbound_size bytes).
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     int flags,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int payload_size;
	int ret;

	/*
	 * Any input parameters required by the method we're calling
	 * will be sent along with the class and method names as
	 * part of the message payload.  That data and its size are
	 * supplied via the indata and indata_len fields (named from
	 * the perspective of the server side) in the OSD request
	 * operation.
	 */
	payload_size = class_name_len + method_name_len + outbound_size;
	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
	if (!ops)
		return -ENOMEM;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = outbound;
	ops[0].cls.indata_len = outbound_size;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			       CEPH_NOSNAP,
			       flags, ops,
			       object_name, 0, inbound_size, inbound,
			       NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1432
1fec7093
YS
1433static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1434{
1435 struct rbd_req_coll *coll =
1436 kzalloc(sizeof(struct rbd_req_coll) +
1437 sizeof(struct rbd_req_status) * num_reqs,
1438 GFP_ATOMIC);
1439
1440 if (!coll)
1441 return NULL;
1442 coll->total = num_reqs;
1443 kref_init(&coll->kref);
1444 return coll;
1445}
1446
602adf40
YS
/*
 * block device queue callback
 *
 * Called with q->queue_lock held.  For each fetched request the lock
 * is dropped while the (possibly blocking) per-segment osd requests
 * are submitted, and reacquired before ending the request or fetching
 * the next one.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->mapping.read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* A mapped snapshot may have been deleted underneath us */
		if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
				!rbd_dev->mapping.snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* Pin the snapshot context; osd requests may outlive a refresh */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		if (num_segs <= 0) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, num_segs);
			ceph_put_snap_context(snapc);
			continue;
		}
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_segment_length(rbd_dev, ofs, size);
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* Record the failure for this segment and move on */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->mapping.snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* Drop the submission reference taken by rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1566
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* sectors per rbd object */
	sector_t sector;
	unsigned int bio_sectors;
	int max;

	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* Bytes remaining before the bio would cross an object boundary */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				 + bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* Allow a single-vector bio even if it straddles a boundary */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1593
/*
 * Tear down the gendisk and its request queue, undoing
 * rbd_init_disk().  Safe to call if the disk was never created.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1607
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header.  Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* no-op on the first iteration */

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1679
/*
 * Read the on-disk header and fill in the in-core image header.
 * On success @header owns the decoded data and carries the header
 * object version that was current when it was read.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;
	kfree(ondisk);

	return ret;
}
1700
dfc5606d
YS
/* Remove and unregister every snapshot device on the rbd device's list. */
static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
{
	struct rbd_snap *snap;
	struct rbd_snap *next;

	/* _safe variant: each entry is deleted from the list as we go */
	list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
		__rbd_remove_snap_dev(snap);
}
1709
602adf40
YS
/*
 * Re-read the header and update the in-core copy under the header
 * rwsem: pick up a resize (base image only), swap in the new snapshot
 * context/names/sizes, and resynchronize the snapshot device list.
 * Caller must hold ctl_mutex (see rbd_refresh_header()).
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		if (size != (sector_t) rbd_dev->mapping.size) {
			dout("setting size to %llu sectors",
				(unsigned long long) size);
			rbd_dev->mapping.size = (u64) size;
			set_capacity(rbd_dev->disk, size);
		}
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	ret = rbd_dev_snaps_update(rbd_dev);
	if (!ret)
		ret = rbd_dev_snaps_register(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1761
1fe5e993
AE
/* Locked wrapper: refresh the header while holding ctl_mutex. */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	ret = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1772
602adf40
YS
/*
 * Allocate and set up the gendisk and request queue for the mapped
 * image: name it rbd<dev_id>, size the queue limits to the rbd
 * object size, and set the initial capacity from the mapping.
 * Returns 0 on success, -ENOMEM on any allocation failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1821
dfc5606d
YS
1822/*
1823 sysfs
1824*/
1825
593a9e7b
AE
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1830
dfc5606d
YS
/* sysfs: show the mapped image size in bytes. */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	/* Capacity is read under the header lock to avoid a torn value */
	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1843
34b13184
AE
/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) rbd_dev->mapping.features);
}
1856
dfc5606d
YS
/* sysfs: show the block device major number. */
static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->major);
}
1864
/* sysfs: show the ceph client id ("client<N>") used for this device. */
static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
			ceph_client_id(rbd_dev->rbd_client->client));
}
1873
dfc5606d
YS
/* sysfs: show the name of the rados pool holding the image. */
static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->pool_name);
}
1881
9bb2f334
AE
/* sysfs: show the numeric id of the rados pool holding the image. */
static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->pool_id);
}
1889
dfc5606d
YS
/* sysfs: show the rbd image name. */
static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->image_name);
}
1897
589d30e0
AE
/* sysfs: show the rbd image id. */
static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->image_id);
}
1905
34b13184
AE
/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
}
1918
/* sysfs: write-only trigger that forces a header re-read from the osd. */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_refresh_header(rbd_dev, NULL);

	/* On success, report the full write as consumed */
	return ret < 0 ? ret : size;
}
602adf40 1931
dfc5606d 1932static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
34b13184 1933static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
dfc5606d
YS
1934static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1935static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1936static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
9bb2f334 1937static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
dfc5606d 1938static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
589d30e0 1939static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
dfc5606d
YS
1940static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1941static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
dfc5606d
YS
1942
1943static struct attribute *rbd_attrs[] = {
1944 &dev_attr_size.attr,
34b13184 1945 &dev_attr_features.attr,
dfc5606d
YS
1946 &dev_attr_major.attr,
1947 &dev_attr_client_id.attr,
1948 &dev_attr_pool.attr,
9bb2f334 1949 &dev_attr_pool_id.attr,
dfc5606d 1950 &dev_attr_name.attr,
589d30e0 1951 &dev_attr_image_id.attr,
dfc5606d
YS
1952 &dev_attr_current_snap.attr,
1953 &dev_attr_refresh.attr,
dfc5606d
YS
1954 NULL
1955};
1956
1957static struct attribute_group rbd_attr_group = {
1958 .attrs = rbd_attrs,
1959};
1960
1961static const struct attribute_group *rbd_attr_groups[] = {
1962 &rbd_attr_group,
1963 NULL
1964};
1965
1966static void rbd_sysfs_dev_release(struct device *dev)
1967{
1968}
1969
1970static struct device_type rbd_device_type = {
1971 .name = "rbd",
1972 .groups = rbd_attr_groups,
1973 .release = rbd_sysfs_dev_release,
1974};
1975
1976
1977/*
1978 sysfs - snapshots
1979*/
1980
/* sysfs (per snapshot): show the snapshot's size in bytes. */
static ssize_t rbd_snap_size_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
}
1989
/* sysfs (per snapshot): show the snapshot's id. */
static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
}
1998
34b13184
AE
/* sysfs (per snapshot): show the snapshot's feature mask. */
static ssize_t rbd_snap_features_show(struct device *dev,
				struct device_attribute *attr,
				char *buf)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long) snap->features);
}
2008
dfc5606d
YS
/* Per-snapshot sysfs attributes (all read-only) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device release: frees the rbd_snap once sysfs drops its last reference */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2040
304f6808
AE
/*
 * Report whether a snapshot's device has been registered with sysfs.
 * The device type is only assigned at registration time, so the two
 * conditions must agree (asserted).
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	rbd_assert(!ret ^ reg);

	return ret;
}
2050
14e7085d 2051static void __rbd_remove_snap_dev(struct rbd_snap *snap)
dfc5606d
YS
2052{
2053 list_del(&snap->node);
304f6808
AE
2054 if (device_is_registered(&snap->dev))
2055 device_unregister(&snap->dev);
dfc5606d
YS
2056}
2057
14e7085d 2058static int rbd_register_snap_dev(struct rbd_snap *snap,
dfc5606d
YS
2059 struct device *parent)
2060{
2061 struct device *dev = &snap->dev;
2062 int ret;
2063
2064 dev->type = &rbd_snap_device_type;
2065 dev->parent = parent;
2066 dev->release = rbd_snap_dev_release;
2067 dev_set_name(dev, "snap_%s", snap->name);
304f6808
AE
2068 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2069
dfc5606d
YS
2070 ret = device_register(dev);
2071
2072 return ret;
2073}
2074
4e891e0a 2075static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
c8d18425 2076 const char *snap_name,
34b13184
AE
2077 u64 snap_id, u64 snap_size,
2078 u64 snap_features)
dfc5606d 2079{
4e891e0a 2080 struct rbd_snap *snap;
dfc5606d 2081 int ret;
4e891e0a
AE
2082
2083 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
dfc5606d 2084 if (!snap)
4e891e0a
AE
2085 return ERR_PTR(-ENOMEM);
2086
2087 ret = -ENOMEM;
c8d18425 2088 snap->name = kstrdup(snap_name, GFP_KERNEL);
4e891e0a
AE
2089 if (!snap->name)
2090 goto err;
2091
c8d18425
AE
2092 snap->id = snap_id;
2093 snap->size = snap_size;
34b13184 2094 snap->features = snap_features;
4e891e0a
AE
2095
2096 return snap;
2097
dfc5606d
YS
2098err:
2099 kfree(snap->name);
2100 kfree(snap);
4e891e0a
AE
2101
2102 return ERR_PTR(ret);
dfc5606d
YS
2103}
2104
cd892126
AE
/*
 * Return the name of snapshot @which (index into the v1 header's
 * snapshot context), filling in its size and feature mask.  Names are
 * stored as consecutive NUL-terminated strings, so the lookup walks
 * the name block.
 */
static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
		u64 *snap_size, u64 *snap_features)
{
	char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	*snap_size = rbd_dev->header.snap_sizes[which];
	*snap_features = 0;	/* No features for v1 */

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return snap_name;
}
2123
dfc5606d 2124/*
35938150
AE
2125 * Scan the rbd device's current snapshot list and compare it to the
2126 * newly-received snapshot context. Remove any existing snapshots
2127 * not present in the new snapshot context. Add a new snapshot for
2128 * any snaphots in the snapshot context not in the current list.
2129 * And verify there are no changes to snapshots we already know
2130 * about.
2131 *
2132 * Assumes the snapshots in the snapshot context are sorted by
2133 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2134 * are also maintained in that order.)
dfc5606d 2135 */
304f6808 2136static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
dfc5606d 2137{
35938150
AE
2138 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2139 const u32 snap_count = snapc->num_snaps;
35938150
AE
2140 struct list_head *head = &rbd_dev->snaps;
2141 struct list_head *links = head->next;
2142 u32 index = 0;
dfc5606d 2143
9fcbb800 2144 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
35938150
AE
2145 while (index < snap_count || links != head) {
2146 u64 snap_id;
2147 struct rbd_snap *snap;
cd892126
AE
2148 char *snap_name;
2149 u64 snap_size = 0;
2150 u64 snap_features = 0;
dfc5606d 2151
35938150
AE
2152 snap_id = index < snap_count ? snapc->snaps[index]
2153 : CEPH_NOSNAP;
2154 snap = links != head ? list_entry(links, struct rbd_snap, node)
2155 : NULL;
aafb230e 2156 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
dfc5606d 2157
35938150
AE
2158 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2159 struct list_head *next = links->next;
dfc5606d 2160
35938150 2161 /* Existing snapshot not in the new snap context */
dfc5606d 2162
f84344f3
AE
2163 if (rbd_dev->mapping.snap_id == snap->id)
2164 rbd_dev->mapping.snap_exists = false;
35938150 2165 __rbd_remove_snap_dev(snap);
9fcbb800 2166 dout("%ssnap id %llu has been removed\n",
f84344f3
AE
2167 rbd_dev->mapping.snap_id == snap->id ?
2168 "mapped " : "",
9fcbb800 2169 (unsigned long long) snap->id);
35938150
AE
2170
2171 /* Done with this list entry; advance */
2172
2173 links = next;
dfc5606d
YS
2174 continue;
2175 }
35938150 2176
cd892126
AE
2177 snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
2178 &snap_size, &snap_features);
2179 if (IS_ERR(snap_name))
2180 return PTR_ERR(snap_name);
2181
9fcbb800
AE
2182 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2183 (unsigned long long) snap_id);
35938150
AE
2184 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2185 struct rbd_snap *new_snap;
2186
2187 /* We haven't seen this snapshot before */
2188
c8d18425 2189 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
cd892126 2190 snap_id, snap_size, snap_features);
9fcbb800
AE
2191 if (IS_ERR(new_snap)) {
2192 int err = PTR_ERR(new_snap);
2193
2194 dout(" failed to add dev, error %d\n", err);
2195
2196 return err;
2197 }
35938150
AE
2198
2199 /* New goes before existing, or at end of list */
2200
9fcbb800 2201 dout(" added dev%s\n", snap ? "" : " at end\n");
35938150
AE
2202 if (snap)
2203 list_add_tail(&new_snap->node, &snap->node);
2204 else
523f3258 2205 list_add_tail(&new_snap->node, head);
35938150
AE
2206 } else {
2207 /* Already have this one */
2208
9fcbb800
AE
2209 dout(" already present\n");
2210
cd892126 2211 rbd_assert(snap->size == snap_size);
aafb230e 2212 rbd_assert(!strcmp(snap->name, snap_name));
cd892126 2213 rbd_assert(snap->features == snap_features);
35938150
AE
2214
2215 /* Done with this list entry; advance */
2216
2217 links = links->next;
dfc5606d 2218 }
35938150
AE
2219
2220 /* Advance to the next entry in the snapshot context */
2221
2222 index++;
dfc5606d 2223 }
9fcbb800 2224 dout("%s: done\n", __func__);
dfc5606d
YS
2225
2226 return 0;
2227}
2228
304f6808
AE
2229/*
2230 * Scan the list of snapshots and register the devices for any that
2231 * have not already been registered.
2232 */
2233static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2234{
2235 struct rbd_snap *snap;
2236 int ret = 0;
2237
2238 dout("%s called\n", __func__);
86ff77bb
AE
2239 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2240 return -EIO;
304f6808
AE
2241
2242 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2243 if (!rbd_snap_registered(snap)) {
2244 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2245 if (ret < 0)
2246 break;
2247 }
2248 }
2249 dout("%s: returning %d\n", __func__, ret);
2250
2251 return ret;
2252}
2253
dfc5606d
YS
2254static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2255{
dfc5606d 2256 struct device *dev;
cd789ab9 2257 int ret;
dfc5606d
YS
2258
2259 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
dfc5606d 2260
cd789ab9 2261 dev = &rbd_dev->dev;
dfc5606d
YS
2262 dev->bus = &rbd_bus_type;
2263 dev->type = &rbd_device_type;
2264 dev->parent = &rbd_root_dev;
2265 dev->release = rbd_dev_release;
de71a297 2266 dev_set_name(dev, "%d", rbd_dev->dev_id);
dfc5606d 2267 ret = device_register(dev);
dfc5606d 2268
dfc5606d 2269 mutex_unlock(&ctl_mutex);
cd789ab9 2270
dfc5606d 2271 return ret;
602adf40
YS
2272}
2273
/*
 * Remove the rbd device from sysfs.  The device's release callback
 * (rbd_dev_release, installed by rbd_bus_add_dev()) runs once the
 * last reference to the device is dropped.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2278
59c2be1e
YS
2279static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2280{
2281 int ret, rc;
2282
2283 do {
0e6f322d 2284 ret = rbd_req_sync_watch(rbd_dev);
59c2be1e 2285 if (ret == -ERANGE) {
1fe5e993 2286 rc = rbd_refresh_header(rbd_dev, NULL);
59c2be1e
YS
2287 if (rc < 0)
2288 return rc;
2289 }
2290 } while (ret == -ERANGE);
2291
2292 return ret;
2293}
2294
/* Highest device id handed out so far; ids start at 1 */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Atomic increment makes the id unique without holding the lock */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
b7f23c36 2311
1ddbe94e 2312/*
499afd5b
AE
2313 * Remove an rbd_dev from the global list, and record that its
2314 * identifier is no longer in use.
1ddbe94e 2315 */
e2839308 2316static void rbd_dev_id_put(struct rbd_device *rbd_dev)
1ddbe94e 2317{
d184f6bf 2318 struct list_head *tmp;
de71a297 2319 int rbd_id = rbd_dev->dev_id;
d184f6bf
AE
2320 int max_id;
2321
aafb230e 2322 rbd_assert(rbd_id > 0);
499afd5b 2323
e2839308
AE
2324 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2325 (unsigned long long) rbd_dev->dev_id);
499afd5b
AE
2326 spin_lock(&rbd_dev_list_lock);
2327 list_del_init(&rbd_dev->node);
d184f6bf
AE
2328
2329 /*
2330 * If the id being "put" is not the current maximum, there
2331 * is nothing special we need to do.
2332 */
e2839308 2333 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
d184f6bf
AE
2334 spin_unlock(&rbd_dev_list_lock);
2335 return;
2336 }
2337
2338 /*
2339 * We need to update the current maximum id. Search the
2340 * list to find out what it is. We're more likely to find
2341 * the maximum at the end, so search the list backward.
2342 */
2343 max_id = 0;
2344 list_for_each_prev(tmp, &rbd_dev_list) {
2345 struct rbd_device *rbd_dev;
2346
2347 rbd_dev = list_entry(tmp, struct rbd_device, node);
2348 if (rbd_id > max_id)
2349 max_id = rbd_id;
2350 }
499afd5b 2351 spin_unlock(&rbd_dev_list_lock);
b7f23c36 2352
1ddbe94e 2353 /*
e2839308 2354 * The max id could have been updated by rbd_dev_id_get(), in
d184f6bf
AE
2355 * which case it now accurately reflects the new maximum.
2356 * Be careful not to overwrite the maximum value in that
2357 * case.
1ddbe94e 2358 */
e2839308
AE
2359 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2360 dout(" max dev id has been reset\n");
b7f23c36
AE
2361}
2362
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip any leading whitespace */
	*buf = p;

	return strcspn(p, spaces);	/* length of the token at p */
}
2381
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* advance past the token regardless of fit */

	return len;
}
2411
ea3352f4
AE
2412/*
2413 * Finds the next token in *buf, dynamically allocates a buffer big
2414 * enough to hold a copy of it, and copies the token into the new
2415 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2416 * that a duplicate buffer is created even for a zero-length token.
2417 *
2418 * Returns a pointer to the newly-allocated duplicate, or a null
2419 * pointer if memory for the duplicate was not available. If
2420 * the lenp argument is a non-null pointer, the length of the token
2421 * (not including the '\0') is returned in *lenp.
2422 *
2423 * If successful, the *buf pointer will be updated to point beyond
2424 * the end of the found token.
2425 *
2426 * Note: uses GFP_KERNEL for allocation.
2427 */
2428static inline char *dup_token(const char **buf, size_t *lenp)
2429{
2430 char *dup;
2431 size_t len;
2432
2433 len = next_token(buf);
2434 dup = kmalloc(len + 1, GFP_KERNEL);
2435 if (!dup)
2436 return NULL;
2437
2438 memcpy(dup, *buf, len);
2439 *(dup + len) = '\0';
2440 *buf += len;
2441
2442 if (lenp)
2443 *lenp = len;
2444
2445 return dup;
2446}
2447
a725f65e 2448/*
3feeb894
AE
2449 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2450 * rbd_md_name, and name fields of the given rbd_dev, based on the
2451 * list of monitor addresses and other options provided via
2452 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2453 * copy of the snapshot name to map if successful, or a
2454 * pointer-coded error otherwise.
d22f76e7
AE
2455 *
2456 * Note: rbd_dev is assumed to have been initially zero-filled.
a725f65e 2457 */
3feeb894
AE
2458static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2459 const char *buf,
2460 const char **mon_addrs,
2461 size_t *mon_addrs_size,
2462 char *options,
2463 size_t options_size)
e28fff26 2464{
d22f76e7 2465 size_t len;
3feeb894
AE
2466 char *err_ptr = ERR_PTR(-EINVAL);
2467 char *snap_name;
e28fff26
AE
2468
2469 /* The first four tokens are required */
2470
7ef3214a
AE
2471 len = next_token(&buf);
2472 if (!len)
3feeb894 2473 return err_ptr;
5214ecc4 2474 *mon_addrs_size = len + 1;
7ef3214a
AE
2475 *mon_addrs = buf;
2476
2477 buf += len;
a725f65e 2478
e28fff26
AE
2479 len = copy_token(&buf, options, options_size);
2480 if (!len || len >= options_size)
3feeb894 2481 return err_ptr;
e28fff26 2482
3feeb894 2483 err_ptr = ERR_PTR(-ENOMEM);
d22f76e7
AE
2484 rbd_dev->pool_name = dup_token(&buf, NULL);
2485 if (!rbd_dev->pool_name)
d22f76e7 2486 goto out_err;
e28fff26 2487
0bed54dc
AE
2488 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2489 if (!rbd_dev->image_name)
bf3e5ae1 2490 goto out_err;
a725f65e 2491
3feeb894
AE
2492 /* Snapshot name is optional */
2493 len = next_token(&buf);
820a5f3e 2494 if (!len) {
3feeb894
AE
2495 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2496 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
849b4260 2497 }
3feeb894
AE
2498 snap_name = kmalloc(len + 1, GFP_KERNEL);
2499 if (!snap_name)
2500 goto out_err;
2501 memcpy(snap_name, buf, len);
2502 *(snap_name + len) = '\0';
e28fff26 2503
3feeb894
AE
2504dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2505
2506 return snap_name;
d22f76e7
AE
2507
2508out_err:
0bed54dc 2509 kfree(rbd_dev->image_name);
d78fd7ae
AE
2510 rbd_dev->image_name = NULL;
2511 rbd_dev->image_name_len = 0;
d22f76e7
AE
2512 kfree(rbd_dev->pool_name);
2513 rbd_dev->pool_name = NULL;
2514
3feeb894 2515 return err_ptr;
a725f65e
AE
2516}
2517
/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* Prefix + image name; sizeof includes room for the '\0' */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" class method on the id object, read-only */
	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Decode the length-prefixed id string into a fresh allocation */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
						p + RBD_IMAGE_ID_LEN_MAX,
						&rbd_dev->image_id_len,
						GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		/* Leave image_id NULL on failure, per the contract above */
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2586
59c2be1e
YS
2587static ssize_t rbd_add(struct bus_type *bus,
2588 const char *buf,
2589 size_t count)
602adf40 2590{
cb8627c7
AE
2591 char *options;
2592 struct rbd_device *rbd_dev = NULL;
7ef3214a
AE
2593 const char *mon_addrs = NULL;
2594 size_t mon_addrs_size = 0;
27cc2594
AE
2595 struct ceph_osd_client *osdc;
2596 int rc = -ENOMEM;
3feeb894 2597 char *snap_name;
602adf40
YS
2598
2599 if (!try_module_get(THIS_MODULE))
2600 return -ENODEV;
2601
60571c7d 2602 options = kmalloc(count, GFP_KERNEL);
602adf40 2603 if (!options)
85ae8926 2604 goto err_out_mem;
cb8627c7
AE
2605 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2606 if (!rbd_dev)
85ae8926 2607 goto err_out_mem;
602adf40
YS
2608
2609 /* static rbd_device initialization */
2610 spin_lock_init(&rbd_dev->lock);
2611 INIT_LIST_HEAD(&rbd_dev->node);
dfc5606d 2612 INIT_LIST_HEAD(&rbd_dev->snaps);
c666601a 2613 init_rwsem(&rbd_dev->header_rwsem);
602adf40 2614
602adf40 2615 /* parse add command */
3feeb894
AE
2616 snap_name = rbd_add_parse_args(rbd_dev, buf,
2617 &mon_addrs, &mon_addrs_size, options, count);
2618 if (IS_ERR(snap_name)) {
2619 rc = PTR_ERR(snap_name);
85ae8926 2620 goto err_out_mem;
3feeb894 2621 }
e124a82f 2622
f8c38929
AE
2623 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2624 if (rc < 0)
85ae8926 2625 goto err_out_args;
602adf40 2626
602adf40 2627 /* pick the pool */
1dbb4399 2628 osdc = &rbd_dev->rbd_client->client->osdc;
602adf40
YS
2629 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2630 if (rc < 0)
2631 goto err_out_client;
9bb2f334 2632 rbd_dev->pool_id = rc;
602adf40 2633
589d30e0
AE
2634 rc = rbd_dev_image_id(rbd_dev);
2635 if (!rc) {
2636 rc = -ENOTSUPP; /* Not actually supporting format 2 yet */
2637 goto err_out_client;
2638 }
2639
2640 /* Version 1 images have no id; empty string is used */
2641
2642 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2643 if (!rbd_dev->image_id) {
2644 rc = -ENOMEM;
2645 goto err_out_client;
2646 }
2647 rbd_dev->image_id_len = 0;
2648
3fcf2581
AE
2649 /* Create the name of the header object */
2650
2651 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
2652 + sizeof (RBD_SUFFIX),
2653 GFP_KERNEL);
2654 if (!rbd_dev->header_name)
2655 goto err_out_client;
2656 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2657
05fd6f6f
AE
2658 /* Get information about the image being mapped */
2659
2660 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
2661 if (rc)
2662 goto err_out_client;
2663
2664 /* no need to lock here, as rbd_dev is not registered yet */
2665 rc = rbd_dev_snaps_update(rbd_dev);
2666 if (rc)
2667 goto err_out_header;
2668
2669 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
2670 if (rc)
2671 goto err_out_header;
2672
85ae8926
AE
2673 /* generate unique id: find highest unique id, add one */
2674 rbd_dev_id_get(rbd_dev);
2675
2676 /* Fill in the device name, now that we have its id. */
2677 BUILD_BUG_ON(DEV_NAME_LEN
2678 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2679 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2680
2681 /* Get our block major device number. */
2682
27cc2594
AE
2683 rc = register_blkdev(0, rbd_dev->name);
2684 if (rc < 0)
85ae8926 2685 goto err_out_id;
27cc2594 2686 rbd_dev->major = rc;
602adf40 2687
0f308a31
AE
2688 /* Set up the blkdev mapping. */
2689
2690 rc = rbd_init_disk(rbd_dev);
dfc5606d 2691 if (rc)
766fc439
YS
2692 goto err_out_blkdev;
2693
0f308a31
AE
2694 rc = rbd_bus_add_dev(rbd_dev);
2695 if (rc)
2696 goto err_out_disk;
2697
32eec68d
AE
2698 /*
2699 * At this point cleanup in the event of an error is the job
2700 * of the sysfs code (initiated by rbd_bus_del_dev()).
32eec68d 2701 */
2ac4e75d 2702
4bb1f1ed 2703 down_write(&rbd_dev->header_rwsem);
5ed16177 2704 rc = rbd_dev_snaps_register(rbd_dev);
4bb1f1ed 2705 up_write(&rbd_dev->header_rwsem);
2ac4e75d
AE
2706 if (rc)
2707 goto err_out_bus;
2708
3ee4001e
AE
2709 rc = rbd_init_watch_dev(rbd_dev);
2710 if (rc)
2711 goto err_out_bus;
2712
2ac4e75d
AE
2713 /* Everything's ready. Announce the disk to the world. */
2714
2ac4e75d 2715 add_disk(rbd_dev->disk);
3ee4001e 2716
2ac4e75d
AE
2717 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2718 (unsigned long long) rbd_dev->mapping.size);
2719
602adf40
YS
2720 return count;
2721
766fc439 2722err_out_bus:
766fc439
YS
2723 /* this will also clean up rest of rbd_dev stuff */
2724
2725 rbd_bus_del_dev(rbd_dev);
2726 kfree(options);
766fc439
YS
2727 return rc;
2728
0f308a31
AE
2729err_out_disk:
2730 rbd_free_disk(rbd_dev);
602adf40
YS
2731err_out_blkdev:
2732 unregister_blkdev(rbd_dev->major, rbd_dev->name);
85ae8926
AE
2733err_out_id:
2734 rbd_dev_id_put(rbd_dev);
05fd6f6f
AE
2735err_out_header:
2736 rbd_header_free(&rbd_dev->header);
602adf40 2737err_out_client:
3fcf2581 2738 kfree(rbd_dev->header_name);
602adf40 2739 rbd_put_client(rbd_dev);
589d30e0 2740 kfree(rbd_dev->image_id);
85ae8926
AE
2741err_out_args:
2742 kfree(rbd_dev->mapping.snap_name);
2743 kfree(rbd_dev->image_name);
2744 kfree(rbd_dev->pool_name);
2745err_out_mem:
27cc2594 2746 kfree(rbd_dev);
cb8627c7 2747 kfree(options);
27cc2594 2748
602adf40
YS
2749 dout("Error adding device %s\n", buf);
2750 module_put(THIS_MODULE);
27cc2594
AE
2751
2752 return (ssize_t) rc;
602adf40
YS
2753}
2754
de71a297 2755static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
602adf40
YS
2756{
2757 struct list_head *tmp;
2758 struct rbd_device *rbd_dev;
2759
e124a82f 2760 spin_lock(&rbd_dev_list_lock);
602adf40
YS
2761 list_for_each(tmp, &rbd_dev_list) {
2762 rbd_dev = list_entry(tmp, struct rbd_device, node);
de71a297 2763 if (rbd_dev->dev_id == dev_id) {
e124a82f 2764 spin_unlock(&rbd_dev_list_lock);
602adf40 2765 return rbd_dev;
e124a82f 2766 }
602adf40 2767 }
e124a82f 2768 spin_unlock(&rbd_dev_list_lock);
602adf40
YS
2769 return NULL;
2770}
2771
/*
 * Device-model release callback for an rbd device (installed by
 * rbd_bus_add_dev()).  Tears down the watch, the ceph client, the
 * block device, the cached header, and finally the rbd_dev itself,
 * then drops the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Stop the lingering watch request, if one was set up */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->mapping.snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2806
/*
 * Handle a write to /sys/bus/rbd/remove: parse the target device
 * id, look it up on the global list, and unregister it (snapshots
 * first, then the device itself).  Returns "count" on success,
 * -ENOENT if no device has that id, or a parse error.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	/* Serialize against other add/remove operations */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* Tear down snapshot devices before the parent device */
	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
2841
602adf40
YS
2842/*
2843 * create control files in sysfs
dfc5606d 2844 * /sys/bus/rbd/...
602adf40
YS
2845 */
2846static int rbd_sysfs_init(void)
2847{
dfc5606d 2848 int ret;
602adf40 2849
fed4c143 2850 ret = device_register(&rbd_root_dev);
21079786 2851 if (ret < 0)
dfc5606d 2852 return ret;
602adf40 2853
fed4c143
AE
2854 ret = bus_register(&rbd_bus_type);
2855 if (ret < 0)
2856 device_unregister(&rbd_root_dev);
602adf40 2857
602adf40
YS
2858 return ret;
2859}
2860
2861static void rbd_sysfs_cleanup(void)
2862{
dfc5606d 2863 bus_unregister(&rbd_bus_type);
fed4c143 2864 device_unregister(&rbd_root_dev);
602adf40
YS
2865}
2866
2867int __init rbd_init(void)
2868{
2869 int rc;
2870
2871 rc = rbd_sysfs_init();
2872 if (rc)
2873 return rc;
f0f8cef5 2874 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
602adf40
YS
2875 return 0;
2876}
2877
/* Module exit: tear down the sysfs bus and root device. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2882
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");