]> git.ipfire.org Git - thirdparty/mdadm.git/blame - managemon.c
Add some comments to explain some of the bits of superswitch.
[thirdparty/mdadm.git] / managemon.c
CommitLineData
549e9569
NB
1
2/*
3 * The management thread for monitoring active md arrays.
4 * This thread does things which might block such as memory
5 * allocation.
6 * In particular:
7 *
8 * - Find out about new arrays in this container.
9 * Allocate the data structures and open the files.
10 *
11 * For this we watch /proc/mdstat and find new arrays with
12 * metadata type that confirms sharing. e.g. "md4"
13 * When we find a new array we slip it into the list of
14 * arrays and signal 'monitor' by writing to a pipe.
15 *
16 * - Respond to reshape requests by allocating new data structures
17 * and opening new files.
18 *
19 * These come as a change to raid_disks. We allocate a new
20 * version of the data structures and slip it into the list.
21 * 'monitor' will notice and release the old version.
22 * Changes to level, chunksize, layout.. do not need re-allocation.
23 * Reductions in raid_disks don't really either, but we handle
24 * them the same way for consistency.
25 *
26 * - When a device is added to the container, we add it to the metadata
27 * as a spare.
28 *
6c3fb95c
NB
29 * - Deal with degraded array
30 * We only do this when first noticing the array is degraded.
31 * This can be when we first see the array, when sync completes or
32 * when recovery completes.
33 *
34 * Check if number of failed devices suggests recovery is needed, and
35 * skip if not.
36 * Ask metadata to allocate a spare device
37 * Add device as not in_sync and give a role
38 * Update metadata.
39 * Open sysfs files and pass to monitor.
40 * Make sure that monitor starts recovery.
549e9569
NB
41 *
42 * - Pass on metadata updates from external programs such as
43 * mdadm creating a new array.
44 *
45 * This is most-messy.
46 * It might involve adding a new array or changing the status of
47 * a spare, or any reconfig that the kernel doesn't get involved in.
48 *
49 * The required updates are received via a named pipe. There will
50 * be one named pipe for each container. Each message contains a
51 * sync marker: 0x5a5aa5a5, A byte count, and the message. This is
52 * passed to the metadata handler which will interpret and process it.
53 * For 'DDF', the messages are internal data blocks with a leading
54 * 'magic number' signifying what sort of data each one is.
55 *
56 */
57
58/*
59 * We select on /proc/mdstat and the named pipe.
60 * We create new arrays or updated version of arrays and slip
61 * them into the head of the list, then signal 'monitor' via a pipe write.
62 * 'monitor' will notice and place the old array on a return list.
63 * Metadata updates are placed on a queue just like they arrive
64 * from the named pipe.
65 *
66 * When new arrays are found based on correct metadata string, we
67 * need to identify them with an entry in the metadata. Maybe we require
68 * the metadata to be mdX/NN when NN is the index into an appropriate table.
69 *
70 */
71
72/*
73 * List of tasks:
74 * - Watch for spares to be added to the container, and write updated
75 * metadata to them.
76 * - Watch for new arrays using this container, confirm they match metadata
77 * and if so, start monitoring them
78 * - Watch for spares being added to monitored arrays. This shouldn't
79 * happen, as we should do all the adding. Just remove them.
80 * - Watch for change in raid-disks, chunk-size, etc. Update metadata and
81 * start a reshape.
82 */
83#ifndef _GNU_SOURCE
84#define _GNU_SOURCE
85#endif
86#include "mdadm.h"
87#include "mdmon.h"
88#include <sys/socket.h>
1ed3f387 89#include <signal.h>
549e9569 90
2a0bb19e
DW
91static void close_aa(struct active_array *aa)
92{
93 struct mdinfo *d;
94
95 for (d = aa->info.devs; d; d = d->next)
96 close(d->state_fd);
97
98 close(aa->action_fd);
99 close(aa->info.state_fd);
100 close(aa->resync_start_fd);
2a0bb19e
DW
101}
102
549e9569
NB
103static void free_aa(struct active_array *aa)
104{
2a0bb19e
DW
105 /* Note that this doesn't close fds if they are being used
106 * by a clone. ->container will be set for a clone
549e9569 107 */
4e6e574a 108 dprintf("%s: devnum: %d\n", __func__, aa->devnum);
2a0bb19e
DW
109 if (!aa->container)
110 close_aa(aa);
549e9569
NB
111 while (aa->info.devs) {
112 struct mdinfo *d = aa->info.devs;
113 aa->info.devs = d->next;
114 free(d);
115 }
116 free(aa);
117}
118
6c3fb95c
NB
119static struct active_array *duplicate_aa(struct active_array *aa)
120{
121 struct active_array *newa = malloc(sizeof(*newa));
122 struct mdinfo **dp1, **dp2;
123
124 *newa = *aa;
125 newa->next = NULL;
126 newa->replaces = NULL;
127 newa->info.next = NULL;
128
129 dp2 = &newa->info.devs;
130
131 for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) {
132 struct mdinfo *d;
133 if ((*dp1)->state_fd < 0)
134 continue;
135
136 d = malloc(sizeof(*d));
137 *d = **dp1;
138 *dp2 = d;
139 dp2 = & d->next;
140 }
7e1432fb 141 *dp2 = NULL;
6c3fb95c
NB
142
143 return newa;
144}
145
2a0bb19e
DW
146static void write_wakeup(struct supertype *c)
147{
3e70c845
DW
148 static struct md_generic_cmd cmd = { .action = md_action_ping_monitor };
149 int err;
150
151 active_cmd = &cmd;
152
153 /* send the monitor thread a pointer to the ping action */
154 write(c->mgr_pipe[1], &err, 1);
155 read(c->mon_pipe[0], &err, 1);
2a0bb19e
DW
156}
157
1ed3f387
NB
158static void remove_old(void)
159{
160 if (discard_this) {
161 discard_this->next = NULL;
162 free_aa(discard_this);
163 if (pending_discard == discard_this)
164 pending_discard = NULL;
165 discard_this = NULL;
166 }
167}
168
549e9569
NB
169static void replace_array(struct supertype *container,
170 struct active_array *old,
171 struct active_array *new)
172{
173 /* To replace an array, we add it to the top of the list
174 * marked with ->replaces to point to the original.
175 * 'monitor' will take the original out of the list
176 * and put it on 'discard_this'. We take it from there
177 * and discard it.
178 */
1ed3f387 179 remove_old();
549e9569 180 while (pending_discard) {
1ed3f387 181 write_wakeup(container);
549e9569
NB
182 while (discard_this == NULL)
183 sleep(1);
1ed3f387 184 remove_old();
549e9569
NB
185 }
186 pending_discard = old;
187 new->replaces = old;
188 new->next = container->arrays;
189 container->arrays = new;
2a0bb19e 190 write_wakeup(container);
549e9569
NB
191}
192
2e735d19
NB
193struct metadata_update *update_queue = NULL;
194struct metadata_update *update_queue_handled = NULL;
195struct metadata_update *update_queue_pending = NULL;
196
197void check_update_queue(struct supertype *container)
198{
199 while (update_queue_handled) {
200 struct metadata_update *this = update_queue_handled;
201 update_queue_handled = this->next;
904c1ef7
NB
202 free(this->buf);
203 if (this->space)
204 free(this->space);
2e735d19
NB
205 free(this);
206 }
207 if (update_queue == NULL &&
208 update_queue_pending) {
209 update_queue = update_queue_pending;
210 update_queue_pending = NULL;
211 write_wakeup(container);
212 }
213}
214
6c3fb95c 215static void queue_metadata_update(struct metadata_update *mu)
2e735d19
NB
216{
217 struct metadata_update **qp;
218
219 qp = &update_queue_pending;
220 while (*qp)
221 qp = & ((*qp)->next);
222 *qp = mu;
223}
224
225void wait_update_handled(void)
226{
227 /* Wait for any pending update to be handled by monitor.
228 * i.e. wait until update_queue is NULL
229 */
230 while (update_queue)
231 usleep(100 * 1000);
232}
233
549e9569
NB
234static void manage_container(struct mdstat_ent *mdstat,
235 struct supertype *container)
236{
237 /* The only thing of interest here is if a new device
238 * has been added to the container. We add it to the
239 * array ignoring any metadata on it.
240 * FIXME should we look for compatible metadata and take hints
241 * about spare assignment.... probably not.
549e9569
NB
242 */
243 if (mdstat->devcnt != container->devcnt) {
244 /* read /sys/block/NAME/md/dev-??/block/dev to find out
245 * what is there, and compare with container->info.devs
246 * To see what is removed and what is added.
247 * These need to be remove from, or added to, the array
248 */
249 // FIXME
250 container->devcnt = mdstat->devcnt;
251 }
252}
253
254static void manage_member(struct mdstat_ent *mdstat,
255 struct active_array *a)
256{
257 /* Compare mdstat info with known state of member array.
258 * We do not need to look for device state changes here, that
259 * is dealt with by the monitor.
260 *
261 * We just look for changes which suggest that a reshape is
262 * being requested.
263 * Unfortunately decreases in raid_disks don't show up in
264 * mdstat until the reshape completes FIXME.
6c3fb95c
NB
265 *
266 * Actually, we also want to handle degraded arrays here by
267 * trying to find and assign a spare.
268 * We do that whenever the monitor tells us too.
549e9569
NB
269 */
270 // FIXME
271 a->info.array.raid_disks = mdstat->raid_disks;
272 a->info.array.chunk_size = mdstat->chunk_size;
273 // MORE
274
6c3fb95c
NB
275 if (a->check_degraded) {
276 struct metadata_update *updates = NULL;
277 struct mdinfo *newdev;
278 struct active_array *newa;
279 wait_update_handled();
280 a->check_degraded = 0;
281
282 /* The array may not be degraded, this is just a good time
283 * to check.
284 */
285 newdev = a->container->ss->activate_spare(a, &updates);
286 if (newdev) {
287 struct mdinfo *d;
288 /* Cool, we can add a device or several. */
289 newa = duplicate_aa(a);
290 /* suspend recovery - maybe not needed */
291
292 /* Add device to array and set offset/size/slot.
293 * and open files for each newdev */
294 for (d = newdev; d ; d = d->next) {
295 struct mdinfo *newd;
296 if (sysfs_add_disk(&newa->info, d))
297 continue;
298 newd = newa->info.devs;
299 newd->state_fd = sysfs_open(a->devnum,
300 newd->sys_name,
301 "state");
302 newd->prev_state
303 = read_dev_state(newd->state_fd);
304 newd->curr_state = newd->prev_state;
305 }
306 queue_metadata_update(updates);
307 replace_array(a->container, a, newa);
308 sysfs_set_str(&a->info, NULL, "sync_action", "repair");
309 }
310 }
549e9569
NB
311}
312
549e9569 313static void manage_new(struct mdstat_ent *mdstat,
2a0bb19e
DW
314 struct supertype *container,
315 struct active_array *victim)
549e9569
NB
316{
317 /* A new array has appeared in this container.
318 * Hopefully it is already recorded in the metadata.
319 * Check, then create the new array to report it to
320 * the monitor.
321 */
322
323 struct active_array *new;
324 struct mdinfo *mdi, *di;
cba0191b 325 char *inst;
549e9569
NB
326 int i;
327
328 new = malloc(sizeof(*new));
329
d52690ac
NB
330 memset(new, 0, sizeof(*new));
331
549e9569 332 new->devnum = mdstat->devnum;
7e1432fb 333 strcpy(new->info.sys_name, devnum2devname(new->devnum));
549e9569
NB
334
335 new->prev_state = new->curr_state = new->next_state = inactive;
336 new->prev_action= new->curr_action= new->next_action= idle;
337
338 new->container = container;
339
cba0191b 340 inst = &mdstat->metadata_version[10+strlen(container->devname)+1];
549e9569
NB
341
342 mdi = sysfs_read(-1, new->devnum,
343 GET_LEVEL|GET_CHUNK|GET_DISKS|
344 GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
345 if (!mdi) {
346 /* Eeek. Cannot monitor this array.
347 * Mark it to be ignored by setting container to NULL
348 */
349 new->container = NULL;
2a0bb19e 350 replace_array(container, victim, new);
549e9569
NB
351 return;
352 }
353
354 new->info.array = mdi->array;
355
356 for (i = 0; i < new->info.array.raid_disks; i++) {
357 struct mdinfo *newd = malloc(sizeof(*newd));
358
359 for (di = mdi->devs; di; di = di->next)
360 if (i == di->disk.raid_disk)
361 break;
362
363 if (di) {
364 memcpy(newd, di, sizeof(*newd));
365
549e9569
NB
366 newd->state_fd = sysfs_open(new->devnum,
367 newd->sys_name,
368 "state");
369
370 newd->prev_state = read_dev_state(newd->state_fd);
6c3fb95c 371 newd->curr_state = newd->prev_state;
549e9569
NB
372 } else {
373 newd->state_fd = -1;
7e1432fb
NB
374 newd->disk.raid_disk = i;
375 newd->prev_state = DS_REMOVE;
376 newd->curr_state = DS_REMOVE;
549e9569 377 }
7e1432fb 378 sprintf(newd->sys_name, "rd%d", i);
549e9569
NB
379 newd->next = new->info.devs;
380 new->info.devs = newd;
381 }
382 new->action_fd = sysfs_open(new->devnum, NULL, "sync_action");
383 new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state");
c052ba30 384 new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start");
77402e51 385 new->resync_start = 0;
4e6e574a
DW
386 dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
387 new->action_fd, new->info.state_fd);
549e9569 388
4fa5aef9 389 sysfs_free(mdi);
549e9569
NB
390 // finds and compares.
391 if (container->ss->open_new(container, new, inst) < 0) {
392 // FIXME close all those files
393 new->container = NULL;
2a0bb19e 394 replace_array(container, victim, new);
549e9569
NB
395 return;
396 }
2a0bb19e 397 replace_array(container, victim, new);
549e9569
NB
398 return;
399}
400
5d19760d 401void manage(struct mdstat_ent *mdstat, struct supertype *container)
549e9569
NB
402{
403 /* We have just read mdstat and need to compare it with
404 * the known active arrays.
405 * Arrays with the wrong metadata are ignored.
406 */
407
408 for ( ; mdstat ; mdstat = mdstat->next) {
409 struct active_array *a;
410 if (mdstat->devnum == container->devnum) {
411 manage_container(mdstat, container);
412 continue;
413 }
414 if (mdstat->metadata_version == NULL ||
415 strncmp(mdstat->metadata_version, "external:/", 10) != 0 ||
416 strncmp(mdstat->metadata_version+10, container->devname,
417 strlen(container->devname)) != 0 ||
418 mdstat->metadata_version[10+strlen(container->devname)]
419 != '/')
420 /* Not for this array */
421 continue;
422 /* Looks like a member of this container */
5d19760d 423 for (a = container->arrays; a; a = a->next) {
549e9569
NB
424 if (mdstat->devnum == a->devnum) {
425 if (a->container)
426 manage_member(mdstat, a);
427 break;
428 }
429 }
2a0bb19e
DW
430 if (a == NULL || !a->container)
431 manage_new(mdstat, container, a);
549e9569
NB
432 }
433}
434
3e70c845
DW
435static int handle_message(struct supertype *container, struct md_message *msg)
436{
437 int err;
438 struct md_generic_cmd *cmd = msg->buf;
439
440 if (!cmd)
441 return 0;
442
443 switch (cmd->action) {
444 case md_action_remove_device:
445
446 /* forward to the monitor */
447 active_cmd = cmd;
448 write(container->mgr_pipe[1], &err, 1);
449 read(container->mon_pipe[0], &err, 1);
450 return err;
451
452 default:
453 return -1;
454 }
455}
456
457void read_sock(struct supertype *container)
549e9569
NB
458{
459 int fd;
b109d928
DW
460 struct md_message msg;
461 int terminate = 0;
462 long fl;
463 int tmo = 3; /* 3 second timeout before hanging up the socket */
549e9569 464
3e70c845 465 fd = accept(container->sock, NULL, NULL);
549e9569
NB
466 if (fd < 0)
467 return;
b109d928
DW
468
469 fl = fcntl(fd, F_GETFL, 0);
470 fl |= O_NONBLOCK;
471 fcntl(fd, F_SETFL, fl);
472
473 do {
3e70c845
DW
474 int err;
475
b109d928
DW
476 msg.buf = NULL;
477
478 /* read and validate the message */
479 if (receive_message(fd, &msg, tmo) == 0) {
3e70c845
DW
480 err = handle_message(container, &msg);
481 if (!err)
482 ack(fd, msg.seq, tmo);
483 else
484 nack(fd, err, tmo);
b109d928
DW
485 } else {
486 terminate = 1;
487 nack(fd, -1, tmo);
488 }
489
490 if (msg.buf)
491 free(msg.buf);
492 } while (!terminate);
493
549e9569
NB
494 close(fd);
495}
1ed3f387
NB
496
/*
 * Set by the signal handler below and tested by the manager loop.
 * 'volatile sig_atomic_t' is the object type that may safely be written
 * from an asynchronous signal handler.
 */
static volatile sig_atomic_t woke = 0;

/* Signal handler: just record that a wake-up signal arrived. */
void wake_me(int sig)
{
	(void)sig;
	woke = 1;
}
502
e0d6609f
NB
503int exit_now = 0;
504int manager_ready = 0;
549e9569
NB
505void do_manager(struct supertype *container)
506{
507 struct mdstat_ent *mdstat;
1ed3f387
NB
508 sigset_t block, orig;
509
510 sigemptyset(&block);
511 sigaddset(&block, SIGUSR1);
512
513 signal(SIGUSR1, wake_me);
549e9569
NB
514
515 do {
1ed3f387
NB
516 woke = 0;
517
e0d6609f
NB
518 if (exit_now)
519 exit(0);
520
549e9569
NB
521 mdstat = mdstat_read(1, 0);
522
5d19760d 523 manage(mdstat, container);
549e9569 524
3e70c845 525 read_sock(container);
549e9569 526
4fa5aef9
DW
527 free_mdstat(mdstat);
528
1ed3f387
NB
529 remove_old();
530
2e735d19
NB
531 check_update_queue(container);
532
e0d6609f 533 manager_ready = 1;
1ed3f387
NB
534 sigprocmask(SIG_SETMASK, &block, &orig);
535 if (woke == 0)
536 mdstat_wait_fd(container->sock, &orig);
537 sigprocmask(SIG_SETMASK, &orig, NULL);
549e9569
NB
538 } while(1);
539}