]> git.ipfire.org Git - thirdparty/mdadm.git/blob - managemon.c
Remove getinfo_super_n and do some other cleaning up.
[thirdparty/mdadm.git] / managemon.c
1
2 /*
3 * The management thread for monitoring active md arrays.
4 * This thread does things which might block such as memory
5 * allocation.
6 * In particular:
7 *
8 * - Find out about new arrays in this container.
9 * Allocate the data structures and open the files.
10 *
11 * For this we watch /proc/mdstat and find new arrays with
12 * metadata type that confirms sharing. e.g. "md4"
13 * When we find a new array we slip it into the list of
14 * arrays and signal 'monitor' by writing to a pipe.
15 *
16 * - Respond to reshape requests by allocating new data structures
17 * and opening new files.
18 *
19 * These come as a change to raid_disks. We allocate a new
20 * version of the data structures and slip it into the list.
21 * 'monitor' will notice and release the old version.
22 * Changes to level, chunksize, layout.. do not need re-allocation.
23 * Reductions in raid_disks don't really either, but we handle
24 * them the same way for consistency.
25 *
26 * - When a device is added to the container, we add it to the metadata
27 * as a spare.
28 *
29 * - Deal with degraded array
30 * We only do this when first noticing the array is degraded.
31 * This can be when we first see the array, when sync completes or
32 * when recovery completes.
33 *
34 * Check if number of failed devices suggests recovery is needed, and
35 * skip if not.
36 * Ask metadata to allocate a spare device
37 * Add device as not in_sync and give a role
38 * Update metadata.
39 * Open sysfs files and pass to monitor.
40 * Make sure that monitor starts recovery....
41 *
42 * - Pass on metadata updates from external programs such as
43 * mdadm creating a new array.
44 *
45 * This is most-messy.
46 * It might involve adding a new array or changing the status of
47 * a spare, or any reconfig that the kernel doesn't get involved in.
48 *
49 * The required updates are received via a named pipe. There will
50 * be one named pipe for each container. Each message contains a
51 * sync marker: 0x5a5aa5a5, A byte count, and the message. This is
52 * passed to the metadata handler which will interpret and process it.
53 * For 'DDF', messages are internal data blocks with the leading
54 * 'magic number' signifying what sort of data it is.
55 *
56 */
57
58 /*
59 * We select on /proc/mdstat and the named pipe.
60 * We create new arrays or updated versions of arrays and slip
61 * them into the head of the list, then signal 'monitor' via a pipe write.
62 * 'monitor' will notice and place the old array on a return list.
63 * Metadata updates are placed on a queue just like they arrive
64 * from the named pipe.
65 *
66 * When new arrays are found based on correct metadata string, we
67 * need to identify them with an entry in the metadata. Maybe we require
68 * the metadata to be mdX/NN where NN is the index into an appropriate table.
69 *
70 */
71
72 /*
73 * List of tasks:
74 * - Watch for spares to be added to the container, and write updated
75 * metadata to them.
76 * - Watch for new arrays using this container, confirm they match metadata
77 * and if so, start monitoring them
78 * - Watch for spares being added to monitored arrays. This shouldn't
79 * happen, as we should do all the adding. Just remove them.
80 * - Watch for change in raid-disks, chunk-size, etc. Update metadata and
81 * start a reshape.
82 */
83 #ifndef _GNU_SOURCE
84 #define _GNU_SOURCE
85 #endif
86 #include "mdadm.h"
87 #include "mdmon.h"
88 #include <sys/socket.h>
89 #include <signal.h>
90
91 static void close_aa(struct active_array *aa)
92 {
93 struct mdinfo *d;
94
95 for (d = aa->info.devs; d; d = d->next)
96 close(d->state_fd);
97
98 close(aa->action_fd);
99 close(aa->info.state_fd);
100 close(aa->resync_start_fd);
101 }
102
103 static void free_aa(struct active_array *aa)
104 {
105 /* Note that this doesn't close fds if they are being used
106 * by a clone. ->container will be set for a clone
107 */
108 dprintf("%s: devnum: %d\n", __func__, aa->devnum);
109 if (!aa->container)
110 close_aa(aa);
111 while (aa->info.devs) {
112 struct mdinfo *d = aa->info.devs;
113 aa->info.devs = d->next;
114 free(d);
115 }
116 free(aa);
117 }
118
119 static struct active_array *duplicate_aa(struct active_array *aa)
120 {
121 struct active_array *newa = malloc(sizeof(*newa));
122 struct mdinfo **dp1, **dp2;
123
124 *newa = *aa;
125 newa->next = NULL;
126 newa->replaces = NULL;
127 newa->info.next = NULL;
128
129 dp2 = &newa->info.devs;
130
131 for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) {
132 struct mdinfo *d;
133 if ((*dp1)->state_fd < 0)
134 continue;
135
136 d = malloc(sizeof(*d));
137 *d = **dp1;
138 *dp2 = d;
139 dp2 = & d->next;
140 }
141 *dp2 = NULL;
142
143 return newa;
144 }
145
146 static void write_wakeup(struct supertype *c)
147 {
148 static struct md_generic_cmd cmd = { .action = md_action_ping_monitor };
149 int err;
150
151 active_cmd = &cmd;
152
153 /* send the monitor thread a pointer to the ping action */
154 write(c->mgr_pipe[1], &err, 1);
155 read(c->mon_pipe[0], &err, 1);
156 }
157
158 static void remove_old(void)
159 {
160 if (discard_this) {
161 discard_this->next = NULL;
162 free_aa(discard_this);
163 if (pending_discard == discard_this)
164 pending_discard = NULL;
165 discard_this = NULL;
166 }
167 }
168
169 static void replace_array(struct supertype *container,
170 struct active_array *old,
171 struct active_array *new)
172 {
173 /* To replace an array, we add it to the top of the list
174 * marked with ->replaces to point to the original.
175 * 'monitor' will take the original out of the list
176 * and put it on 'discard_this'. We take it from there
177 * and discard it.
178 */
179 remove_old();
180 while (pending_discard) {
181 write_wakeup(container);
182 while (discard_this == NULL)
183 sleep(1);
184 remove_old();
185 }
186 pending_discard = old;
187 new->replaces = old;
188 new->next = container->arrays;
189 container->arrays = new;
190 write_wakeup(container);
191 }
192
/* Updates currently handed over for processing; wait_update_handled()
 * polls until this becomes NULL.
 */
struct metadata_update *update_queue = NULL;
/* Updates that are finished with; check_update_queue() frees them. */
struct metadata_update *update_queue_handled = NULL;
/* Updates appended by queue_metadata_update(); check_update_queue()
 * moves them to update_queue once that is empty.
 */
struct metadata_update *update_queue_pending = NULL;
196
197 void check_update_queue(struct supertype *container)
198 {
199 while (update_queue_handled) {
200 struct metadata_update *this = update_queue_handled;
201 update_queue_handled = this->next;
202 free(this->buf);
203 if (this->space)
204 free(this->space);
205 free(this);
206 }
207 if (update_queue == NULL &&
208 update_queue_pending) {
209 update_queue = update_queue_pending;
210 update_queue_pending = NULL;
211 write_wakeup(container);
212 }
213 }
214
215 static void queue_metadata_update(struct metadata_update *mu)
216 {
217 struct metadata_update **qp;
218
219 qp = &update_queue_pending;
220 while (*qp)
221 qp = & ((*qp)->next);
222 *qp = mu;
223 }
224
225 void wait_update_handled(void)
226 {
227 /* Wait for any pending update to be handled by monitor.
228 * i.e. wait until update_queue is NULL
229 */
230 while (update_queue)
231 usleep(100 * 1000);
232 }
233
234 static void manage_container(struct mdstat_ent *mdstat,
235 struct supertype *container)
236 {
237 /* The only thing of interest here is if a new device
238 * has been added to the container. We add it to the
239 * array ignoring any metadata on it.
240 * FIXME should we look for compatible metadata and take hints
241 * about spare assignment.... probably not.
242 */
243 if (mdstat->devcnt != container->devcnt) {
244 /* read /sys/block/NAME/md/dev-??/block/dev to find out
245 * what is there, and compare with container->info.devs
246 * To see what is removed and what is added.
247 * These need to be remove from, or added to, the array
248 */
249 // FIXME
250 container->devcnt = mdstat->devcnt;
251 }
252 }
253
254 static void manage_member(struct mdstat_ent *mdstat,
255 struct active_array *a)
256 {
257 /* Compare mdstat info with known state of member array.
258 * We do not need to look for device state changes here, that
259 * is dealt with by the monitor.
260 *
261 * We just look for changes which suggest that a reshape is
262 * being requested.
263 * Unfortunately decreases in raid_disks don't show up in
264 * mdstat until the reshape completes FIXME.
265 *
266 * Actually, we also want to handle degraded arrays here by
267 * trying to find and assign a spare.
268 * We do that whenever the monitor tells us too.
269 */
270 // FIXME
271 a->info.array.raid_disks = mdstat->raid_disks;
272 a->info.array.chunk_size = mdstat->chunk_size;
273 // MORE
274
275 if (a->check_degraded) {
276 struct metadata_update *updates = NULL;
277 struct mdinfo *newdev;
278 struct active_array *newa;
279 wait_update_handled();
280 a->check_degraded = 0;
281
282 /* The array may not be degraded, this is just a good time
283 * to check.
284 */
285 newdev = a->container->ss->activate_spare(a, &updates);
286 if (newdev) {
287 struct mdinfo *d;
288 /* Cool, we can add a device or several. */
289 newa = duplicate_aa(a);
290 /* suspend recovery - maybe not needed */
291
292 /* Add device to array and set offset/size/slot.
293 * and open files for each newdev */
294 for (d = newdev; d ; d = d->next) {
295 struct mdinfo *newd;
296 if (sysfs_add_disk(&newa->info, d))
297 continue;
298 newd = newa->info.devs;
299 newd->state_fd = sysfs_open(a->devnum,
300 newd->sys_name,
301 "state");
302 newd->prev_state
303 = read_dev_state(newd->state_fd);
304 newd->curr_state = newd->prev_state;
305 }
306 queue_metadata_update(updates);
307 replace_array(a->container, a, newa);
308 sysfs_set_str(&a->info, NULL, "sync_action", "repair");
309 }
310 }
311 }
312
313 static void manage_new(struct mdstat_ent *mdstat,
314 struct supertype *container,
315 struct active_array *victim)
316 {
317 /* A new array has appeared in this container.
318 * Hopefully it is already recorded in the metadata.
319 * Check, then create the new array to report it to
320 * the monitor.
321 */
322
323 struct active_array *new;
324 struct mdinfo *mdi, *di;
325 char *inst;
326 int i;
327
328 new = malloc(sizeof(*new));
329
330 memset(new, 0, sizeof(*new));
331
332 new->devnum = mdstat->devnum;
333 strcpy(new->info.sys_name, devnum2devname(new->devnum));
334
335 new->prev_state = new->curr_state = new->next_state = inactive;
336 new->prev_action= new->curr_action= new->next_action= idle;
337
338 new->container = container;
339
340 inst = &mdstat->metadata_version[10+strlen(container->devname)+1];
341
342 mdi = sysfs_read(-1, new->devnum,
343 GET_LEVEL|GET_CHUNK|GET_DISKS|
344 GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
345 if (!mdi) {
346 /* Eeek. Cannot monitor this array.
347 * Mark it to be ignored by setting container to NULL
348 */
349 new->container = NULL;
350 replace_array(container, victim, new);
351 return;
352 }
353
354 new->info.array = mdi->array;
355
356 for (i = 0; i < new->info.array.raid_disks; i++) {
357 struct mdinfo *newd = malloc(sizeof(*newd));
358
359 for (di = mdi->devs; di; di = di->next)
360 if (i == di->disk.raid_disk)
361 break;
362
363 if (di) {
364 memcpy(newd, di, sizeof(*newd));
365
366 newd->state_fd = sysfs_open(new->devnum,
367 newd->sys_name,
368 "state");
369
370 newd->prev_state = read_dev_state(newd->state_fd);
371 newd->curr_state = newd->prev_state;
372 } else {
373 newd->state_fd = -1;
374 newd->disk.raid_disk = i;
375 newd->prev_state = DS_REMOVE;
376 newd->curr_state = DS_REMOVE;
377 }
378 sprintf(newd->sys_name, "rd%d", i);
379 newd->next = new->info.devs;
380 new->info.devs = newd;
381 }
382 new->action_fd = sysfs_open(new->devnum, NULL, "sync_action");
383 new->info.state_fd = sysfs_open(new->devnum, NULL, "array_state");
384 new->resync_start_fd = sysfs_open(new->devnum, NULL, "resync_start");
385 new->resync_start = 0;
386 dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
387 new->action_fd, new->info.state_fd);
388
389 sysfs_free(mdi);
390 // finds and compares.
391 if (container->ss->open_new(container, new, inst) < 0) {
392 // FIXME close all those files
393 new->container = NULL;
394 replace_array(container, victim, new);
395 return;
396 }
397 replace_array(container, victim, new);
398 return;
399 }
400
401 void manage(struct mdstat_ent *mdstat, struct supertype *container)
402 {
403 /* We have just read mdstat and need to compare it with
404 * the known active arrays.
405 * Arrays with the wrong metadata are ignored.
406 */
407
408 for ( ; mdstat ; mdstat = mdstat->next) {
409 struct active_array *a;
410 if (mdstat->devnum == container->devnum) {
411 manage_container(mdstat, container);
412 continue;
413 }
414 if (mdstat->metadata_version == NULL ||
415 strncmp(mdstat->metadata_version, "external:/", 10) != 0 ||
416 strncmp(mdstat->metadata_version+10, container->devname,
417 strlen(container->devname)) != 0 ||
418 mdstat->metadata_version[10+strlen(container->devname)]
419 != '/')
420 /* Not for this array */
421 continue;
422 /* Looks like a member of this container */
423 for (a = container->arrays; a; a = a->next) {
424 if (mdstat->devnum == a->devnum) {
425 if (a->container)
426 manage_member(mdstat, a);
427 break;
428 }
429 }
430 if (a == NULL || !a->container)
431 manage_new(mdstat, container, a);
432 }
433 }
434
435 static int handle_message(struct supertype *container, struct md_message *msg)
436 {
437 int err;
438 struct md_generic_cmd *cmd = msg->buf;
439
440 if (!cmd)
441 return 0;
442
443 switch (cmd->action) {
444 case md_action_remove_device:
445
446 /* forward to the monitor */
447 active_cmd = cmd;
448 write(container->mgr_pipe[1], &err, 1);
449 read(container->mon_pipe[0], &err, 1);
450 return err;
451
452 default:
453 return -1;
454 }
455 }
456
457 void read_sock(struct supertype *container)
458 {
459 int fd;
460 struct md_message msg;
461 int terminate = 0;
462 long fl;
463 int tmo = 3; /* 3 second timeout before hanging up the socket */
464
465 fd = accept(container->sock, NULL, NULL);
466 if (fd < 0)
467 return;
468
469 fl = fcntl(fd, F_GETFL, 0);
470 fl |= O_NONBLOCK;
471 fcntl(fd, F_SETFL, fl);
472
473 do {
474 int err;
475
476 msg.buf = NULL;
477
478 /* read and validate the message */
479 if (receive_message(fd, &msg, tmo) == 0) {
480 err = handle_message(container, &msg);
481 if (!err)
482 ack(fd, msg.seq, tmo);
483 else
484 nack(fd, err, tmo);
485 } else {
486 terminate = 1;
487 nack(fd, -1, tmo);
488 }
489
490 if (msg.buf)
491 free(msg.buf);
492 } while (!terminate);
493
494 close(fd);
495 }
496
/* Set by wake_me() when the signal arrives.  An object written from a
 * signal handler and read by the interrupted code has to be a
 * volatile sig_atomic_t for the access to be well defined.
 */
static volatile sig_atomic_t woke = 0;

/* Signal handler: just record that a wake-up was requested. */
void wake_me(int sig)
{
	(void)sig;	/* the signal number is not needed */
	woke = 1;
}
502
503 int exit_now = 0;
504 int manager_ready = 0;
505 void do_manager(struct supertype *container)
506 {
507 struct mdstat_ent *mdstat;
508 sigset_t block, orig;
509
510 sigemptyset(&block);
511 sigaddset(&block, SIGUSR1);
512
513 signal(SIGUSR1, wake_me);
514
515 do {
516 woke = 0;
517
518 if (exit_now)
519 exit(0);
520
521 mdstat = mdstat_read(1, 0);
522
523 manage(mdstat, container);
524
525 read_sock(container);
526
527 free_mdstat(mdstat);
528
529 remove_old();
530
531 check_update_queue(container);
532
533 manager_ready = 1;
534 sigprocmask(SIG_SETMASK, &block, &orig);
535 if (woke == 0)
536 mdstat_wait_fd(container->sock, &orig);
537 sigprocmask(SIG_SETMASK, &orig, NULL);
538 } while(1);
539 }