Merge branch 'master' into devel-3.0
[thirdparty/mdadm.git] / mdmon.c
CommitLineData
549e9569
NB
1
2/*
3 * md array manager.
4 * When md arrays have user-space managed metadata, this is the program
5 * that does the managing.
6 *
7 * Given one argument: the name of the array (e.g. /dev/md0) that is
8 * the container.
9 * We fork off a helper that runs high priority and mlocked. It responds to
10 * device failures and other events that might stop writeout, or that are
11 * trivial to deal with.
12 * The main thread then watches for new arrays being created in the container
13 * and starts monitoring them too ... along with a few other tasks.
14 *
15 * The main thread communicates with the priority thread by writing over
16 * a pipe.
17 * Separate programs can communicate with the main thread via Unix-domain
18 * socket.
19 * The two threads share address space and open file table.
20 *
21 */
22
23#ifndef _GNU_SOURCE
24#define _GNU_SOURCE
25#endif
26
27#include <unistd.h>
28#include <stdlib.h>
4d43913c 29#include <sys/types.h>
549e9569
NB
30#include <sys/stat.h>
31#include <sys/socket.h>
32#include <sys/un.h>
33#include <sys/mman.h>
4d43913c 34#include <sys/syscall.h>
9fe32043 35#include <sys/wait.h>
549e9569
NB
36#include <stdio.h>
37#include <errno.h>
38#include <string.h>
39#include <fcntl.h>
b109d928 40#include <signal.h>
13047e4c 41#include <dirent.h>
549e9569
NB
42
43#include <sched.h>
44
45#include "mdadm.h"
46#include "mdmon.h"
47
549e9569
NB
/* NOTE(review): neither pointer is referenced in the part of the file
 * visible here; the names suggest arrays waiting to be discarded --
 * confirm against the code that uses them.
 */
struct active_array *discard_this;
struct active_array *pending_discard;

/* Thread ids recorded by clone_monitor(): mon_tid is the value returned
 * by clone() for the monitor thread, mgr_tid is the tid of the thread
 * that called clone_monitor().
 */
int mon_tid;
int mgr_tid;

/* Set to 1 by the SIGTERM handler.  Once set, make_pidfile(),
 * remove_pidfile() and make_control_sock() return without doing anything.
 */
int sigterm;
54
549e9569
NB
/* Entry point handed to clone(): run the monitor loop on the container
 * passed through the opaque argument.
 *
 * @v: the struct supertype describing the container
 * Returns 0 once do_monitor() returns.
 */
int run_child(void *v)
{
	do_monitor((struct supertype *)v);

	return 0;
}
62
63int clone_monitor(struct supertype *container)
64{
549e9569 65 static char stack[4096];
549e9569 66
2cc98f9e 67 mon_tid = clone(run_child, stack+4096-64,
549e9569
NB
68 CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
69 container);
3e70c845 70
4d43913c 71 mgr_tid = syscall(SYS_gettid);
2cc98f9e
DW
72
73 return mon_tid;
549e9569
NB
74}
75
76static struct superswitch *find_metadata_methods(char *vers)
77{
78 if (strcmp(vers, "ddf") == 0)
79 return &super_ddf;
5b65005f
DW
80 if (strcmp(vers, "imsm") == 0)
81 return &super_imsm;
549e9569
NB
82 return NULL;
83}
84
85
295646b3 86int make_pidfile(char *devname, int o_excl)
549e9569
NB
87{
88 char path[100];
89 char pid[10];
90 int fd;
3d2c4fc7
DW
91 int n;
92
6144ed44
DW
93 if (sigterm)
94 return -1;
95
549e9569
NB
96 sprintf(path, "/var/run/mdadm/%s.pid", devname);
97
b109d928 98 fd = open(path, O_RDWR|O_CREAT|o_excl, 0600);
549e9569 99 if (fd < 0)
295646b3 100 return -errno;
549e9569 101 sprintf(pid, "%d\n", getpid());
3d2c4fc7 102 n = write(fd, pid, strlen(pid));
549e9569 103 close(fd);
3d2c4fc7
DW
104 if (n < 0)
105 return -errno;
549e9569
NB
106 return 0;
107}
108
883a6142
DW
109int is_container_member(struct mdstat_ent *mdstat, char *container)
110{
111 if (mdstat->metadata_version == NULL ||
112 strncmp(mdstat->metadata_version, "external:", 9) != 0 ||
113 !is_subarray(mdstat->metadata_version+9) ||
114 strncmp(mdstat->metadata_version+10, container, strlen(container)) != 0 ||
115 mdstat->metadata_version[10+strlen(container)] != '/')
116 return 0;
117
118 return 1;
119}
120
121void remove_pidfile(char *devname);
b109d928
DW
122static void try_kill_monitor(char *devname)
123{
124 char buf[100];
125 int fd;
126 pid_t pid;
883a6142 127 struct mdstat_ent *mdstat;
b109d928
DW
128
129 sprintf(buf, "/var/run/mdadm/%s.pid", devname);
130 fd = open(buf, O_RDONLY);
131 if (fd < 0)
132 return;
133
134 if (read(fd, buf, sizeof(buf)) < 0) {
135 close(fd);
136 return;
137 }
138
139 close(fd);
140 pid = strtoul(buf, NULL, 10);
141
8aae4219
DW
142 /* first rule of survival... don't off yourself */
143 if (pid == getpid())
144 return;
145
b109d928
DW
146 /* kill this process if it is mdmon */
147 sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid);
148 fd = open(buf, O_RDONLY);
149 if (fd < 0)
150 return;
151
152 if (read(fd, buf, sizeof(buf)) < 0) {
153 close(fd);
154 return;
155 }
156
883a6142
DW
157 if (!strstr(buf, "mdmon"))
158 return;
159
160 kill(pid, SIGTERM);
161
162 mdstat = mdstat_read(0, 0);
163 for ( ; mdstat; mdstat = mdstat->next)
164 if (is_container_member(mdstat, devname)) {
165 sprintf(buf, "/dev/%s", mdstat->dev);
27dec8fa 166 WaitClean(buf, 0);
883a6142
DW
167 }
168 free_mdstat(mdstat);
169 remove_pidfile(devname);
b109d928
DW
170}
171
e0d6609f
NB
172void remove_pidfile(char *devname)
173{
174 char buf[100];
175
6144ed44
DW
176 if (sigterm)
177 return;
178
e0d6609f
NB
179 sprintf(buf, "/var/run/mdadm/%s.pid", devname);
180 unlink(buf);
57752795
N
181 sprintf(buf, "/var/run/mdadm/%s.sock", devname);
182 unlink(buf);
e0d6609f
NB
183}
184
295646b3 185int make_control_sock(char *devname)
549e9569
NB
186{
187 char path[100];
188 int sfd;
189 long fl;
190 struct sockaddr_un addr;
191
6144ed44
DW
192 if (sigterm)
193 return -1;
194
549e9569
NB
195 sprintf(path, "/var/run/mdadm/%s.sock", devname);
196 unlink(path);
197 sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
198 if (sfd < 0)
199 return -1;
200
201 addr.sun_family = PF_LOCAL;
202 strcpy(addr.sun_path, path);
203 if (bind(sfd, &addr, sizeof(addr)) < 0) {
204 close(sfd);
205 return -1;
206 }
207 listen(sfd, 10);
208 fl = fcntl(sfd, F_GETFL, 0);
209 fl |= O_NONBLOCK;
210 fcntl(sfd, F_SETFL, fl);
211 return sfd;
212}
213
295646b3
DW
/* Raised by the SIGHUP handler below. */
int socket_hup_requested;

/* SIGHUP handler: only note that the signal arrived. */
static void hup(int sig)
{
	(void)sig;

	socket_hup_requested = 1;
}
219
6144ed44
DW
220static void term(int sig)
221{
222 sigterm = 1;
223}
224
4d43913c
NB
/* Handler that intentionally does nothing: main() installs it for
 * SIGUSR1 and SIGALRM so that delivery of those signals runs a handler
 * instead of taking the default action.
 */
static void wake_me(int sig)
{
	(void)sig;
}
229
16ddab0d
DW
/* Should main() fork into the background?
 *
 * Always yes in a normal build.  In a DEBUG build the answer is no when
 * env_no_mdmon() reports true, so that a hand-started mdmon stays in
 * the foreground.
 *
 * Returns 1 to fork, 0 to stay in the calling process.
 */
static int do_fork(void)
{
	int want_fork = 1;

#ifdef DEBUG
	if (env_no_mdmon())
		want_fork = 0;
#endif

	return want_fork;
}
240
13047e4c
DW
/* Print the command-line synopsis on stderr and exit with status 2.
 * Does not return.
 */
void usage(void)
{
	fputs("Usage: mdmon [--switch-root dir] /device/name/for/container\n",
	      stderr);
	exit(2);
}
16ddab0d 246
549e9569
NB
247int main(int argc, char *argv[])
248{
249 int mdfd;
549e9569
NB
250 struct mdinfo *mdi, *di;
251 struct supertype *container;
4d43913c 252 sigset_t set;
bfa44e2e 253 struct sigaction act;
9fe32043
N
254 int pfd[2];
255 int status;
3d2c4fc7 256 int ignore;
13047e4c
DW
257 char *container_name = NULL;
258 char *switchroot = NULL;
259
260 switch (argc) {
261 case 2:
262 container_name = argv[1];
263 break;
264 case 4:
265 if (strcmp(argv[1], "--switch-root") != 0) {
266 fprintf(stderr, "mdmon: unknown argument %s\n", argv[1]);
267 usage();
268 }
269 switchroot = argv[2];
270 container_name = argv[3];
271 break;
272 default:
273 usage();
549e9569 274 }
13047e4c
DW
275
276 mdfd = open(container_name, O_RDWR);
549e9569 277 if (mdfd < 0) {
13047e4c 278 fprintf(stderr, "mdmon: %s: %s\n", container_name,
549e9569
NB
279 strerror(errno));
280 exit(1);
281 }
282 if (md_get_version(mdfd) < 0) {
13047e4c
DW
283 fprintf(stderr, "mdmon: %s: Not an md device\n",
284 container_name);
549e9569
NB
285 exit(1);
286 }
287
9fe32043 288 /* Fork, and have the child tell us when they are ready */
16ddab0d 289 if (do_fork()) {
3d2c4fc7
DW
290 if (pipe(pfd) != 0) {
291 fprintf(stderr, "mdmon: failed to create pipe\n");
292 exit(1);
293 }
16ddab0d
DW
294 switch(fork()) {
295 case -1:
296 fprintf(stderr, "mdmon: failed to fork: %s\n",
297 strerror(errno));
298 exit(1);
299 case 0: /* child */
300 close(pfd[0]);
301 break;
302 default: /* parent */
303 close(pfd[1]);
304 if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
305 wait(&status);
306 status = WEXITSTATUS(status);
307 }
308 exit(status);
9fe32043 309 }
16ddab0d
DW
310 } else
311 pfd[0] = pfd[1] = -1;
549e9569
NB
312
313 container = malloc(sizeof(*container));
549e9569
NB
314 container->devnum = fd2devnum(mdfd);
315 container->devname = devnum2devname(container->devnum);
13047e4c
DW
316 container->device_name = container_name;
317 container->arrays = NULL;
318
319 if (!container->devname) {
320 fprintf(stderr, "mdmon: failed to allocate container name string\n");
321 exit(3);
322 }
323
324 mdi = sysfs_read(mdfd, container->devnum,
325 GET_VERSION|GET_LEVEL|GET_DEVS);
326
327 if (!mdi) {
328 fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
329 container->devname);
330 exit(3);
331 }
332 if (mdi->array.level != UnSet) {
333 fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
334 container_name);
335 exit(3);
336 }
337 if (mdi->array.major_version != -1 ||
338 mdi->array.minor_version != -2) {
339 fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
340 container_name);
341 exit(3);
342 }
343
344 container->ss = find_metadata_methods(mdi->text_version);
345 if (container->ss == NULL) {
346 fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
347 container_name, mdi->text_version);
348 exit(3);
349 }
350
351 container->devs = NULL;
352 for (di = mdi->devs; di; di = di->next) {
353 struct mdinfo *cd = malloc(sizeof(*cd));
354 *cd = *di;
355 cd->next = container->devs;
356 container->devs = cd;
357 }
358 sysfs_free(mdi);
549e9569 359
883a6142
DW
360 /* SIGUSR is sent between parent and child. So both block it
361 * and enable it only with pselect.
362 */
363 sigemptyset(&set);
364 sigaddset(&set, SIGUSR1);
365 sigaddset(&set, SIGHUP);
366 sigaddset(&set, SIGALRM);
367 sigaddset(&set, SIGTERM);
368 sigprocmask(SIG_BLOCK, &set, NULL);
369 act.sa_handler = wake_me;
370 act.sa_flags = 0;
371 sigaction(SIGUSR1, &act, NULL);
372 sigaction(SIGALRM, &act, NULL);
373 act.sa_handler = hup;
374 sigaction(SIGHUP, &act, NULL);
375 act.sa_handler = term;
376 sigaction(SIGTERM, &act, NULL);
377 act.sa_handler = SIG_IGN;
378 sigaction(SIGPIPE, &act, NULL);
379
13047e4c
DW
380 if (switchroot) {
381 /* we assume we assume that /sys /proc /dev are available in
382 * the new root (see nash:setuproot)
383 *
384 * kill any monitors in the current namespace and change
385 * to the new one
386 */
387 try_kill_monitor(container->devname);
388 if (chroot(switchroot) != 0) {
389 fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n",
390 switchroot, strerror(errno));
391 exit(4);
392 }
393 }
394
395 /* If this fails, we hope it already exists
396 * pid file lives in /var/run/mdadm/mdXX.pid
397 */
398 mkdir("/var", 0600);
399 mkdir("/var/run", 0600);
549e9569 400 mkdir("/var/run/mdadm", 0600);
13047e4c 401 ignore = chdir("/");
b109d928
DW
402 if (make_pidfile(container->devname, O_EXCL) < 0) {
403 if (ping_monitor(container->devname) == 0) {
404 fprintf(stderr, "mdmon: %s already managed\n",
405 container->devname);
406 exit(3);
407 } else {
295646b3
DW
408 int err;
409
b109d928
DW
410 /* cleanup the old monitor, this one is taking over */
411 try_kill_monitor(container->devname);
295646b3
DW
412 err = make_pidfile(container->devname, 0);
413 if (err < 0) {
b109d928
DW
414 fprintf(stderr, "mdmon: %s Cannot create pidfile\n",
415 container->devname);
295646b3
DW
416 if (err == -EROFS) {
417 /* FIXME implement a mechanism to
418 * prevent duplicate monitor instances
419 */
420 fprintf(stderr,
421 "mdmon: continuing on read-only file system\n");
422 } else
423 exit(3);
b109d928
DW
424 }
425 }
549e9569 426 }
549e9569 427 container->sock = make_control_sock(container->devname);
549e9569 428
13047e4c 429 if (container->ss->load_super(container, mdfd, container_name)) {
549e9569 430 fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
13047e4c 431 container_name);
549e9569
NB
432 exit(3);
433 }
549e9569 434
9fe32043
N
435 /* Ok, this is close enough. We can say goodbye to our parent now.
436 */
437 status = 0;
3d2c4fc7
DW
438 if (write(pfd[1], &status, sizeof(status)) < 0)
439 fprintf(stderr, "mdmon: failed to notify our parent: %d\n",
440 getppid());
9fe32043
N
441 close(pfd[1]);
442
9fe32043
N
443 setsid();
444 close(0);
445 open("/dev/null", O_RDWR);
446 close(1);
3d2c4fc7 447 ignore = dup(0);
9fe32043
N
448#ifndef DEBUG
449 close(2);
3d2c4fc7 450 ignore = dup(0);
9fe32043
N
451#endif
452
549e9569
NB
453 mlockall(MCL_FUTURE);
454
3e70c845 455 if (clone_monitor(container) < 0) {
295646b3 456 fprintf(stderr, "mdmon: failed to start monitor process: %s\n",
549e9569
NB
457 strerror(errno));
458 exit(2);
459 }
460
461 do_manager(container);
462
463 exit(0);
464}