Release 3.0-devel2
[thirdparty/mdadm.git] / mdmon.c
CommitLineData
a54d5262
DW
1/*
2 * mdmon - monitor external metadata arrays
3 *
4 * Copyright (C) 2007-2008 Neil Brown <neilb@suse.de>
5 * Copyright (C) 2007-2008 Intel Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
549e9569
NB
20
21/*
22 * md array manager.
23 * When md arrays have user-space managed metadata, this is the program
24 * that does the managing.
25 *
26 * Given one argument: the name of the array (e.g. /dev/md0) that is
27 * the container.
28 * We fork off a helper that runs high priority and mlocked. It responds to
29 * device failures and other events that might stop writeout, or that are
30 * trivial to deal with.
31 * The main thread then watches for new arrays being created in the container
32 * and starts monitoring them too ... along with a few other tasks.
33 *
34 * The main thread communicates with the priority thread by writing over
35 * a pipe.
36 * Separate programs can communicate with the main thread via Unix-domain
37 * socket.
38 * The two threads share address space and open file table.
39 *
40 */
41
42#ifndef _GNU_SOURCE
43#define _GNU_SOURCE
44#endif
45
46#include <unistd.h>
47#include <stdlib.h>
4d43913c 48#include <sys/types.h>
549e9569
NB
49#include <sys/stat.h>
50#include <sys/socket.h>
51#include <sys/un.h>
52#include <sys/mman.h>
4d43913c 53#include <sys/syscall.h>
9fe32043 54#include <sys/wait.h>
549e9569
NB
55#include <stdio.h>
56#include <errno.h>
57#include <string.h>
58#include <fcntl.h>
b109d928 59#include <signal.h>
13047e4c 60#include <dirent.h>
549e9569
NB
61
62#include <sched.h>
63
64#include "mdadm.h"
65#include "mdmon.h"
66
549e9569
NB
/*
 * Arrays queued for disposal.  Neither pointer is touched in this file;
 * presumably they are hand-off slots shared between the manager and the
 * monitor thread -- TODO confirm against their users.
 */
struct active_array *discard_this;
struct active_array *pending_discard;

/* Thread id of the monitor, as returned by clone() in clone_monitor(). */
int mon_tid;
/* Thread id of the thread that called clone_monitor() (the manager). */
int mgr_tid;

/*
 * Set to 1 by the SIGTERM handler.  Once set, make_pidfile(),
 * remove_pidfile() and make_control_sock() refuse to touch the filesystem.
 */
int sigterm;
73
549e9569
NB
/*
 * Entry point of the cloned monitor thread.
 *
 * @v: opaque clone() argument; it is the container's struct supertype.
 *
 * Runs do_monitor() on the container and returns 0 once it comes back.
 */
int run_child(void *v)
{
	do_monitor((struct supertype *)v);

	return 0;
}
81
82int clone_monitor(struct supertype *container)
83{
549e9569 84 static char stack[4096];
549e9569 85
2cc98f9e 86 mon_tid = clone(run_child, stack+4096-64,
549e9569
NB
87 CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
88 container);
3e70c845 89
4d43913c 90 mgr_tid = syscall(SYS_gettid);
2cc98f9e
DW
91
92 return mon_tid;
549e9569
NB
93}
94
ce744c97
DW
95static struct superswitch *find_metadata_methods(char *vers)
96{
97 if (strcmp(vers, "ddf") == 0)
98 return &super_ddf;
99 if (strcmp(vers, "imsm") == 0)
100 return &super_imsm;
101 return NULL;
102}
103
549e9569 104
295646b3 105int make_pidfile(char *devname, int o_excl)
549e9569
NB
106{
107 char path[100];
108 char pid[10];
109 int fd;
3d2c4fc7
DW
110 int n;
111
6144ed44
DW
112 if (sigterm)
113 return -1;
114
549e9569
NB
115 sprintf(path, "/var/run/mdadm/%s.pid", devname);
116
b109d928 117 fd = open(path, O_RDWR|O_CREAT|o_excl, 0600);
549e9569 118 if (fd < 0)
295646b3 119 return -errno;
549e9569 120 sprintf(pid, "%d\n", getpid());
3d2c4fc7 121 n = write(fd, pid, strlen(pid));
549e9569 122 close(fd);
3d2c4fc7
DW
123 if (n < 0)
124 return -errno;
549e9569
NB
125 return 0;
126}
127
883a6142
DW
128int is_container_member(struct mdstat_ent *mdstat, char *container)
129{
130 if (mdstat->metadata_version == NULL ||
131 strncmp(mdstat->metadata_version, "external:", 9) != 0 ||
132 !is_subarray(mdstat->metadata_version+9) ||
133 strncmp(mdstat->metadata_version+10, container, strlen(container)) != 0 ||
134 mdstat->metadata_version[10+strlen(container)] != '/')
135 return 0;
136
137 return 1;
138}
139
140void remove_pidfile(char *devname);
b109d928
DW
141static void try_kill_monitor(char *devname)
142{
143 char buf[100];
144 int fd;
145 pid_t pid;
883a6142 146 struct mdstat_ent *mdstat;
b109d928
DW
147
148 sprintf(buf, "/var/run/mdadm/%s.pid", devname);
149 fd = open(buf, O_RDONLY);
150 if (fd < 0)
151 return;
152
153 if (read(fd, buf, sizeof(buf)) < 0) {
154 close(fd);
155 return;
156 }
157
158 close(fd);
159 pid = strtoul(buf, NULL, 10);
160
8aae4219
DW
161 /* first rule of survival... don't off yourself */
162 if (pid == getpid())
163 return;
164
b109d928
DW
165 /* kill this process if it is mdmon */
166 sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid);
167 fd = open(buf, O_RDONLY);
168 if (fd < 0)
169 return;
170
171 if (read(fd, buf, sizeof(buf)) < 0) {
172 close(fd);
173 return;
174 }
175
883a6142
DW
176 if (!strstr(buf, "mdmon"))
177 return;
178
179 kill(pid, SIGTERM);
180
181 mdstat = mdstat_read(0, 0);
182 for ( ; mdstat; mdstat = mdstat->next)
183 if (is_container_member(mdstat, devname)) {
184 sprintf(buf, "/dev/%s", mdstat->dev);
27dec8fa 185 WaitClean(buf, 0);
883a6142
DW
186 }
187 free_mdstat(mdstat);
188 remove_pidfile(devname);
b109d928
DW
189}
190
e0d6609f
NB
191void remove_pidfile(char *devname)
192{
193 char buf[100];
194
6144ed44
DW
195 if (sigterm)
196 return;
197
e0d6609f
NB
198 sprintf(buf, "/var/run/mdadm/%s.pid", devname);
199 unlink(buf);
57752795
N
200 sprintf(buf, "/var/run/mdadm/%s.sock", devname);
201 unlink(buf);
e0d6609f
NB
202}
203
295646b3 204int make_control_sock(char *devname)
549e9569
NB
205{
206 char path[100];
207 int sfd;
208 long fl;
209 struct sockaddr_un addr;
210
6144ed44
DW
211 if (sigterm)
212 return -1;
213
549e9569
NB
214 sprintf(path, "/var/run/mdadm/%s.sock", devname);
215 unlink(path);
216 sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
217 if (sfd < 0)
218 return -1;
219
220 addr.sun_family = PF_LOCAL;
221 strcpy(addr.sun_path, path);
222 if (bind(sfd, &addr, sizeof(addr)) < 0) {
223 close(sfd);
224 return -1;
225 }
226 listen(sfd, 10);
227 fl = fcntl(sfd, F_GETFL, 0);
228 fl |= O_NONBLOCK;
229 fcntl(sfd, F_SETFL, fl);
230 return sfd;
231}
232
295646b3
DW
/* Raised by the SIGHUP handler; this file never reads or clears it. */
int socket_hup_requested;

/* SIGHUP handler: just record that the signal arrived. */
static void hup(int sig)
{
	(void)sig;

	socket_hup_requested = 1;
}
238
6144ed44
DW
239static void term(int sig)
240{
241 sigterm = 1;
242}
243
4d43913c
NB
/*
 * Deliberately empty handler, installed for SIGUSR1 and SIGALRM so that
 * their delivery interrupts a blocking call instead of being ignored or
 * killing the process.
 */
static void wake_me(int sig)
{
	(void)sig;
}
248
16ddab0d
DW
/*
 * Tell main() whether to fork into the background.
 *
 * Always 1, except in DEBUG builds where check_env("MDADM_NO_MDMON")
 * being true yields 0 so a hand-started mdmon stays in the foreground.
 */
static int do_fork(void)
{
	int want_fork = 1;

#ifdef DEBUG
	if (check_env("MDADM_NO_MDMON"))
		want_fork = 0;
#endif

	return want_fork;
}
259
13047e4c
DW
/* Print the command-line synopsis on stderr and exit with status 2. */
void usage(void)
{
	fputs("Usage: mdmon [--switch-root dir] /device/name/for/container\n",
	      stderr);
	exit(2);
}
16ddab0d 265
549e9569
NB
266int main(int argc, char *argv[])
267{
268 int mdfd;
549e9569
NB
269 struct mdinfo *mdi, *di;
270 struct supertype *container;
4d43913c 271 sigset_t set;
bfa44e2e 272 struct sigaction act;
9fe32043
N
273 int pfd[2];
274 int status;
3d2c4fc7 275 int ignore;
13047e4c
DW
276 char *container_name = NULL;
277 char *switchroot = NULL;
278
279 switch (argc) {
280 case 2:
281 container_name = argv[1];
282 break;
283 case 4:
284 if (strcmp(argv[1], "--switch-root") != 0) {
285 fprintf(stderr, "mdmon: unknown argument %s\n", argv[1]);
286 usage();
287 }
288 switchroot = argv[2];
289 container_name = argv[3];
290 break;
291 default:
292 usage();
549e9569 293 }
13047e4c
DW
294
295 mdfd = open(container_name, O_RDWR);
549e9569 296 if (mdfd < 0) {
13047e4c 297 fprintf(stderr, "mdmon: %s: %s\n", container_name,
549e9569
NB
298 strerror(errno));
299 exit(1);
300 }
301 if (md_get_version(mdfd) < 0) {
13047e4c
DW
302 fprintf(stderr, "mdmon: %s: Not an md device\n",
303 container_name);
549e9569
NB
304 exit(1);
305 }
306
9fe32043 307 /* Fork, and have the child tell us when they are ready */
16ddab0d 308 if (do_fork()) {
3d2c4fc7
DW
309 if (pipe(pfd) != 0) {
310 fprintf(stderr, "mdmon: failed to create pipe\n");
311 exit(1);
312 }
16ddab0d
DW
313 switch(fork()) {
314 case -1:
315 fprintf(stderr, "mdmon: failed to fork: %s\n",
316 strerror(errno));
317 exit(1);
318 case 0: /* child */
319 close(pfd[0]);
320 break;
321 default: /* parent */
322 close(pfd[1]);
323 if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
324 wait(&status);
325 status = WEXITSTATUS(status);
326 }
327 exit(status);
9fe32043 328 }
16ddab0d
DW
329 } else
330 pfd[0] = pfd[1] = -1;
549e9569
NB
331
332 container = malloc(sizeof(*container));
549e9569
NB
333 container->devnum = fd2devnum(mdfd);
334 container->devname = devnum2devname(container->devnum);
13047e4c
DW
335 container->device_name = container_name;
336 container->arrays = NULL;
337
338 if (!container->devname) {
339 fprintf(stderr, "mdmon: failed to allocate container name string\n");
340 exit(3);
341 }
342
343 mdi = sysfs_read(mdfd, container->devnum,
344 GET_VERSION|GET_LEVEL|GET_DEVS);
345
346 if (!mdi) {
347 fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
348 container->devname);
349 exit(3);
350 }
351 if (mdi->array.level != UnSet) {
352 fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
353 container_name);
354 exit(3);
355 }
356 if (mdi->array.major_version != -1 ||
357 mdi->array.minor_version != -2) {
358 fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
359 container_name);
360 exit(3);
361 }
362
363 container->ss = find_metadata_methods(mdi->text_version);
364 if (container->ss == NULL) {
365 fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
366 container_name, mdi->text_version);
367 exit(3);
368 }
369
370 container->devs = NULL;
371 for (di = mdi->devs; di; di = di->next) {
372 struct mdinfo *cd = malloc(sizeof(*cd));
373 *cd = *di;
374 cd->next = container->devs;
375 container->devs = cd;
376 }
377 sysfs_free(mdi);
549e9569 378
883a6142
DW
379 /* SIGUSR is sent between parent and child. So both block it
380 * and enable it only with pselect.
381 */
382 sigemptyset(&set);
383 sigaddset(&set, SIGUSR1);
384 sigaddset(&set, SIGHUP);
385 sigaddset(&set, SIGALRM);
386 sigaddset(&set, SIGTERM);
387 sigprocmask(SIG_BLOCK, &set, NULL);
388 act.sa_handler = wake_me;
389 act.sa_flags = 0;
390 sigaction(SIGUSR1, &act, NULL);
391 sigaction(SIGALRM, &act, NULL);
392 act.sa_handler = hup;
393 sigaction(SIGHUP, &act, NULL);
394 act.sa_handler = term;
395 sigaction(SIGTERM, &act, NULL);
396 act.sa_handler = SIG_IGN;
397 sigaction(SIGPIPE, &act, NULL);
398
13047e4c
DW
399 if (switchroot) {
400 /* we assume we assume that /sys /proc /dev are available in
401 * the new root (see nash:setuproot)
402 *
403 * kill any monitors in the current namespace and change
404 * to the new one
405 */
406 try_kill_monitor(container->devname);
407 if (chroot(switchroot) != 0) {
408 fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n",
409 switchroot, strerror(errno));
410 exit(4);
411 }
412 }
413
414 /* If this fails, we hope it already exists
415 * pid file lives in /var/run/mdadm/mdXX.pid
416 */
417 mkdir("/var", 0600);
418 mkdir("/var/run", 0600);
549e9569 419 mkdir("/var/run/mdadm", 0600);
13047e4c 420 ignore = chdir("/");
b109d928
DW
421 if (make_pidfile(container->devname, O_EXCL) < 0) {
422 if (ping_monitor(container->devname) == 0) {
423 fprintf(stderr, "mdmon: %s already managed\n",
424 container->devname);
425 exit(3);
426 } else {
295646b3
DW
427 int err;
428
b109d928
DW
429 /* cleanup the old monitor, this one is taking over */
430 try_kill_monitor(container->devname);
295646b3
DW
431 err = make_pidfile(container->devname, 0);
432 if (err < 0) {
b109d928
DW
433 fprintf(stderr, "mdmon: %s Cannot create pidfile\n",
434 container->devname);
295646b3
DW
435 if (err == -EROFS) {
436 /* FIXME implement a mechanism to
437 * prevent duplicate monitor instances
438 */
439 fprintf(stderr,
440 "mdmon: continuing on read-only file system\n");
441 } else
442 exit(3);
b109d928
DW
443 }
444 }
549e9569 445 }
549e9569 446 container->sock = make_control_sock(container->devname);
549e9569 447
13047e4c 448 if (container->ss->load_super(container, mdfd, container_name)) {
549e9569 449 fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
13047e4c 450 container_name);
549e9569
NB
451 exit(3);
452 }
549e9569 453
9fe32043
N
454 /* Ok, this is close enough. We can say goodbye to our parent now.
455 */
456 status = 0;
3d2c4fc7
DW
457 if (write(pfd[1], &status, sizeof(status)) < 0)
458 fprintf(stderr, "mdmon: failed to notify our parent: %d\n",
459 getppid());
9fe32043
N
460 close(pfd[1]);
461
9fe32043
N
462 setsid();
463 close(0);
464 open("/dev/null", O_RDWR);
465 close(1);
3d2c4fc7 466 ignore = dup(0);
9fe32043
N
467#ifndef DEBUG
468 close(2);
3d2c4fc7 469 ignore = dup(0);
9fe32043
N
470#endif
471
549e9569
NB
472 mlockall(MCL_FUTURE);
473
3e70c845 474 if (clone_monitor(container) < 0) {
295646b3 475 fprintf(stderr, "mdmon: failed to start monitor process: %s\n",
549e9569
NB
476 strerror(errno));
477 exit(2);
478 }
479
480 do_manager(container);
481
482 exit(0);
483}