]> git.ipfire.org Git - thirdparty/mdadm.git/blame - mdmon.c
mdmon: --switch-root
[thirdparty/mdadm.git] / mdmon.c
CommitLineData
549e9569
NB
1
2/*
3 * md array manager.
4 * When md arrays have user-space managed metadata, this is the program
5 * that does the managing.
6 *
7 * Given one argument: the name of the array (e.g. /dev/md0) that is
8 * the container.
9 * We fork off a helper that runs high priority and mlocked. It responds to
10 * device failures and other events that might stop writeout, or that are
11 * trivial to deal with.
12 * The main thread then watches for new arrays being created in the container
13 * and starts monitoring them too ... along with a few other tasks.
14 *
15 * The main thread communicates with the priority thread by writing over
16 * a pipe.
17 * Separate programs can communicate with the main thread via Unix-domain
18 * socket.
19 * The two threads share address space and open file table.
20 *
21 */
22
23#ifndef _GNU_SOURCE
24#define _GNU_SOURCE
25#endif
26
27#include <unistd.h>
28#include <stdlib.h>
4d43913c 29#include <sys/types.h>
549e9569
NB
30#include <sys/stat.h>
31#include <sys/socket.h>
32#include <sys/un.h>
33#include <sys/mman.h>
4d43913c 34#include <sys/syscall.h>
9fe32043 35#include <sys/wait.h>
549e9569
NB
36#include <stdio.h>
37#include <errno.h>
38#include <string.h>
39#include <fcntl.h>
b109d928 40#include <signal.h>
13047e4c 41#include <dirent.h>
549e9569
NB
42
43#include <sched.h>
44
45#include "mdadm.h"
46#include "mdmon.h"
47
549e9569
NB
/* Not referenced anywhere else in this file.
 * NOTE(review): presumably used to hand arrays that are being torn down
 * between the manager and monitor threads - confirm against the other
 * mdmon source files.
 */
struct active_array *discard_this;
struct active_array *pending_discard;

/* Thread ids filled in by clone_monitor(): mon_tid is the value returned
 * by clone() for the monitor thread, mgr_tid is gettid() of the caller.
 */
int mon_tid, mgr_tid;

/* Set to 1 by the SIGTERM handler term().  Once set, make_pidfile() and
 * make_control_sock() fail with -1 and remove_pidfile() does nothing.
 */
int sigterm;
54
549e9569
NB
/* Entry point handed to clone() by clone_monitor().
 * 'v' is the container's struct supertype; it is passed straight on to
 * do_monitor().  Always returns 0 once do_monitor() returns.
 */
int run_child(void *v)
{
	do_monitor((struct supertype *)v);

	return 0;
}
62
63int clone_monitor(struct supertype *container)
64{
549e9569 65 static char stack[4096];
549e9569 66
2cc98f9e 67 mon_tid = clone(run_child, stack+4096-64,
549e9569
NB
68 CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
69 container);
3e70c845 70
4d43913c 71 mgr_tid = syscall(SYS_gettid);
2cc98f9e
DW
72
73 return mon_tid;
549e9569
NB
74}
75
76static struct superswitch *find_metadata_methods(char *vers)
77{
78 if (strcmp(vers, "ddf") == 0)
79 return &super_ddf;
5b65005f
DW
80 if (strcmp(vers, "imsm") == 0)
81 return &super_imsm;
549e9569
NB
82 return NULL;
83}
84
85
295646b3 86int make_pidfile(char *devname, int o_excl)
549e9569
NB
87{
88 char path[100];
89 char pid[10];
90 int fd;
3d2c4fc7
DW
91 int n;
92
6144ed44
DW
93 if (sigterm)
94 return -1;
95
549e9569
NB
96 sprintf(path, "/var/run/mdadm/%s.pid", devname);
97
b109d928 98 fd = open(path, O_RDWR|O_CREAT|o_excl, 0600);
549e9569 99 if (fd < 0)
295646b3 100 return -errno;
549e9569 101 sprintf(pid, "%d\n", getpid());
3d2c4fc7 102 n = write(fd, pid, strlen(pid));
549e9569 103 close(fd);
3d2c4fc7
DW
104 if (n < 0)
105 return -errno;
549e9569
NB
106 return 0;
107}
108
883a6142
DW
109int is_container_member(struct mdstat_ent *mdstat, char *container)
110{
111 if (mdstat->metadata_version == NULL ||
112 strncmp(mdstat->metadata_version, "external:", 9) != 0 ||
113 !is_subarray(mdstat->metadata_version+9) ||
114 strncmp(mdstat->metadata_version+10, container, strlen(container)) != 0 ||
115 mdstat->metadata_version[10+strlen(container)] != '/')
116 return 0;
117
118 return 1;
119}
120
121void remove_pidfile(char *devname);
b109d928
DW
122static void try_kill_monitor(char *devname)
123{
124 char buf[100];
125 int fd;
126 pid_t pid;
883a6142 127 struct mdstat_ent *mdstat;
b109d928
DW
128
129 sprintf(buf, "/var/run/mdadm/%s.pid", devname);
130 fd = open(buf, O_RDONLY);
131 if (fd < 0)
132 return;
133
134 if (read(fd, buf, sizeof(buf)) < 0) {
135 close(fd);
136 return;
137 }
138
139 close(fd);
140 pid = strtoul(buf, NULL, 10);
141
142 /* kill this process if it is mdmon */
143 sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid);
144 fd = open(buf, O_RDONLY);
145 if (fd < 0)
146 return;
147
148 if (read(fd, buf, sizeof(buf)) < 0) {
149 close(fd);
150 return;
151 }
152
883a6142
DW
153 if (!strstr(buf, "mdmon"))
154 return;
155
156 kill(pid, SIGTERM);
157
158 mdstat = mdstat_read(0, 0);
159 for ( ; mdstat; mdstat = mdstat->next)
160 if (is_container_member(mdstat, devname)) {
161 sprintf(buf, "/dev/%s", mdstat->dev);
162 WaitClean(buf);
163 }
164 free_mdstat(mdstat);
165 remove_pidfile(devname);
b109d928
DW
166}
167
e0d6609f
NB
168void remove_pidfile(char *devname)
169{
170 char buf[100];
171
6144ed44
DW
172 if (sigterm)
173 return;
174
e0d6609f
NB
175 sprintf(buf, "/var/run/mdadm/%s.pid", devname);
176 unlink(buf);
57752795
N
177 sprintf(buf, "/var/run/mdadm/%s.sock", devname);
178 unlink(buf);
e0d6609f
NB
179}
180
295646b3 181int make_control_sock(char *devname)
549e9569
NB
182{
183 char path[100];
184 int sfd;
185 long fl;
186 struct sockaddr_un addr;
187
6144ed44
DW
188 if (sigterm)
189 return -1;
190
549e9569
NB
191 sprintf(path, "/var/run/mdadm/%s.sock", devname);
192 unlink(path);
193 sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
194 if (sfd < 0)
195 return -1;
196
197 addr.sun_family = PF_LOCAL;
198 strcpy(addr.sun_path, path);
199 if (bind(sfd, &addr, sizeof(addr)) < 0) {
200 close(sfd);
201 return -1;
202 }
203 listen(sfd, 10);
204 fl = fcntl(sfd, F_GETFL, 0);
205 fl |= O_NONBLOCK;
206 fcntl(sfd, F_SETFL, fl);
207 return sfd;
208}
209
295646b3
DW
/* Raised by the SIGHUP handler below; nothing in this file clears it. */
int socket_hup_requested;

/* SIGHUP handler: only records that the signal arrived. */
static void hup(int sig)
{
	(void)sig;	/* signal number is not needed */

	socket_hup_requested = 1;
}
215
6144ed44
DW
216static void term(int sig)
217{
218 sigterm = 1;
219}
220
4d43913c
NB
/* Deliberately empty signal handler; main() installs it for SIGUSR1 and
 * SIGALRM.
 */
static void wake_me(int sig)
{
	(void)sig;
}
225
16ddab0d
DW
/* Tell main() whether it should fork into the background.
 *
 * Returns 1 normally.  Only when built with DEBUG, returns 0 if
 * env_no_mdmon() is non-zero, so that an mdmon started by hand for
 * debugging stays in the foreground.
 */
static int do_fork(void)
{
	int should_fork = 1;

#ifdef DEBUG
	if (env_no_mdmon())
		should_fork = 0;
#endif

	return should_fork;
}
236
13047e4c
DW
/* Print the one-line usage message on stderr and terminate the process
 * with exit status 2.  Does not return.
 */
void usage(void)
{
	fputs("Usage: mdmon [--switch-root dir] /device/name/for/container\n",
	      stderr);
	exit(2);
}
16ddab0d 242
549e9569
NB
243int main(int argc, char *argv[])
244{
245 int mdfd;
549e9569
NB
246 struct mdinfo *mdi, *di;
247 struct supertype *container;
4d43913c 248 sigset_t set;
bfa44e2e 249 struct sigaction act;
9fe32043
N
250 int pfd[2];
251 int status;
3d2c4fc7 252 int ignore;
13047e4c
DW
253 char *container_name = NULL;
254 char *switchroot = NULL;
255
256 switch (argc) {
257 case 2:
258 container_name = argv[1];
259 break;
260 case 4:
261 if (strcmp(argv[1], "--switch-root") != 0) {
262 fprintf(stderr, "mdmon: unknown argument %s\n", argv[1]);
263 usage();
264 }
265 switchroot = argv[2];
266 container_name = argv[3];
267 break;
268 default:
269 usage();
549e9569 270 }
13047e4c
DW
271
272 mdfd = open(container_name, O_RDWR);
549e9569 273 if (mdfd < 0) {
13047e4c 274 fprintf(stderr, "mdmon: %s: %s\n", container_name,
549e9569
NB
275 strerror(errno));
276 exit(1);
277 }
278 if (md_get_version(mdfd) < 0) {
13047e4c
DW
279 fprintf(stderr, "mdmon: %s: Not an md device\n",
280 container_name);
549e9569
NB
281 exit(1);
282 }
283
9fe32043 284 /* Fork, and have the child tell us when they are ready */
16ddab0d 285 if (do_fork()) {
3d2c4fc7
DW
286 if (pipe(pfd) != 0) {
287 fprintf(stderr, "mdmon: failed to create pipe\n");
288 exit(1);
289 }
16ddab0d
DW
290 switch(fork()) {
291 case -1:
292 fprintf(stderr, "mdmon: failed to fork: %s\n",
293 strerror(errno));
294 exit(1);
295 case 0: /* child */
296 close(pfd[0]);
297 break;
298 default: /* parent */
299 close(pfd[1]);
300 if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
301 wait(&status);
302 status = WEXITSTATUS(status);
303 }
304 exit(status);
9fe32043 305 }
16ddab0d
DW
306 } else
307 pfd[0] = pfd[1] = -1;
549e9569
NB
308
309 container = malloc(sizeof(*container));
549e9569
NB
310 container->devnum = fd2devnum(mdfd);
311 container->devname = devnum2devname(container->devnum);
13047e4c
DW
312 container->device_name = container_name;
313 container->arrays = NULL;
314
315 if (!container->devname) {
316 fprintf(stderr, "mdmon: failed to allocate container name string\n");
317 exit(3);
318 }
319
320 mdi = sysfs_read(mdfd, container->devnum,
321 GET_VERSION|GET_LEVEL|GET_DEVS);
322
323 if (!mdi) {
324 fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
325 container->devname);
326 exit(3);
327 }
328 if (mdi->array.level != UnSet) {
329 fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
330 container_name);
331 exit(3);
332 }
333 if (mdi->array.major_version != -1 ||
334 mdi->array.minor_version != -2) {
335 fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
336 container_name);
337 exit(3);
338 }
339
340 container->ss = find_metadata_methods(mdi->text_version);
341 if (container->ss == NULL) {
342 fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
343 container_name, mdi->text_version);
344 exit(3);
345 }
346
347 container->devs = NULL;
348 for (di = mdi->devs; di; di = di->next) {
349 struct mdinfo *cd = malloc(sizeof(*cd));
350 *cd = *di;
351 cd->next = container->devs;
352 container->devs = cd;
353 }
354 sysfs_free(mdi);
549e9569 355
883a6142
DW
356 /* SIGUSR is sent between parent and child. So both block it
357 * and enable it only with pselect.
358 */
359 sigemptyset(&set);
360 sigaddset(&set, SIGUSR1);
361 sigaddset(&set, SIGHUP);
362 sigaddset(&set, SIGALRM);
363 sigaddset(&set, SIGTERM);
364 sigprocmask(SIG_BLOCK, &set, NULL);
365 act.sa_handler = wake_me;
366 act.sa_flags = 0;
367 sigaction(SIGUSR1, &act, NULL);
368 sigaction(SIGALRM, &act, NULL);
369 act.sa_handler = hup;
370 sigaction(SIGHUP, &act, NULL);
371 act.sa_handler = term;
372 sigaction(SIGTERM, &act, NULL);
373 act.sa_handler = SIG_IGN;
374 sigaction(SIGPIPE, &act, NULL);
375
13047e4c
DW
376 if (switchroot) {
377 /* we assume we assume that /sys /proc /dev are available in
378 * the new root (see nash:setuproot)
379 *
380 * kill any monitors in the current namespace and change
381 * to the new one
382 */
383 try_kill_monitor(container->devname);
384 if (chroot(switchroot) != 0) {
385 fprintf(stderr, "mdmon: failed to chroot to '%s': %s\n",
386 switchroot, strerror(errno));
387 exit(4);
388 }
389 }
390
391 /* If this fails, we hope it already exists
392 * pid file lives in /var/run/mdadm/mdXX.pid
393 */
394 mkdir("/var", 0600);
395 mkdir("/var/run", 0600);
549e9569 396 mkdir("/var/run/mdadm", 0600);
13047e4c 397 ignore = chdir("/");
b109d928
DW
398 if (make_pidfile(container->devname, O_EXCL) < 0) {
399 if (ping_monitor(container->devname) == 0) {
400 fprintf(stderr, "mdmon: %s already managed\n",
401 container->devname);
402 exit(3);
403 } else {
295646b3
DW
404 int err;
405
b109d928
DW
406 /* cleanup the old monitor, this one is taking over */
407 try_kill_monitor(container->devname);
295646b3
DW
408 err = make_pidfile(container->devname, 0);
409 if (err < 0) {
b109d928
DW
410 fprintf(stderr, "mdmon: %s Cannot create pidfile\n",
411 container->devname);
295646b3
DW
412 if (err == -EROFS) {
413 /* FIXME implement a mechanism to
414 * prevent duplicate monitor instances
415 */
416 fprintf(stderr,
417 "mdmon: continuing on read-only file system\n");
418 } else
419 exit(3);
b109d928
DW
420 }
421 }
549e9569 422 }
549e9569 423 container->sock = make_control_sock(container->devname);
549e9569 424
13047e4c 425 if (container->ss->load_super(container, mdfd, container_name)) {
549e9569 426 fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
13047e4c 427 container_name);
549e9569
NB
428 exit(3);
429 }
549e9569 430
9fe32043
N
431 /* Ok, this is close enough. We can say goodbye to our parent now.
432 */
433 status = 0;
3d2c4fc7
DW
434 if (write(pfd[1], &status, sizeof(status)) < 0)
435 fprintf(stderr, "mdmon: failed to notify our parent: %d\n",
436 getppid());
9fe32043
N
437 close(pfd[1]);
438
9fe32043
N
439 setsid();
440 close(0);
441 open("/dev/null", O_RDWR);
442 close(1);
3d2c4fc7 443 ignore = dup(0);
9fe32043
N
444#ifndef DEBUG
445 close(2);
3d2c4fc7 446 ignore = dup(0);
9fe32043
N
447#endif
448
549e9569
NB
449 mlockall(MCL_FUTURE);
450
3e70c845 451 if (clone_monitor(container) < 0) {
295646b3 452 fprintf(stderr, "mdmon: failed to start monitor process: %s\n",
549e9569
NB
453 strerror(errno));
454 exit(2);
455 }
456
457 do_manager(container);
458
459 exit(0);
460}