Treat all devices at the container level as spares
[thirdparty/mdadm.git] / mdmon.c
CommitLineData
549e9569
NB
1
/*
 * md array manager.
 * When md arrays have user-space managed metadata, this is the program
 * that does the managing.
 *
 * Given one argument: the name of the array (e.g. /dev/md0) that is
 * the container.
 * We fork off a helper that runs high priority and mlocked.  It responds to
 * device failures and other events that might stop writeout, or that are
 * trivial to deal with.
 * The main thread then watches for new arrays being created in the container
 * and starts monitoring them too ... along with a few other tasks.
 *
 * The main thread communicates with the priority thread by writing over
 * a pipe.
 * Separate programs can communicate with the main thread via a Unix-domain
 * socket.
 * The two threads share address space and open file table.
 *
 */
22
23#ifndef _GNU_SOURCE
24#define _GNU_SOURCE
25#endif
26
27#include <unistd.h>
28#include <stdlib.h>
4d43913c 29#include <sys/types.h>
549e9569
NB
30#include <sys/stat.h>
31#include <sys/socket.h>
32#include <sys/un.h>
33#include <sys/mman.h>
4d43913c 34#include <sys/syscall.h>
9fe32043 35#include <sys/wait.h>
549e9569
NB
36#include <stdio.h>
37#include <errno.h>
38#include <string.h>
39#include <fcntl.h>
b109d928 40#include <signal.h>
549e9569
NB
41
42#include <sched.h>
43
44#include "mdadm.h"
45#include "mdmon.h"
46
549e9569
NB
/* Array hand-off slots.  Neither is referenced in this part of the file;
 * presumably they are shared between the manager and the monitor for
 * arrays that are being retired -- confirm against their users.
 */
struct active_array *discard_this;
struct active_array *pending_discard;

/* Thread id of the monitor: the value clone() returned in clone_monitor(). */
int mon_tid;
/* Thread id of the manager: gettid() of the thread that ran clone_monitor(). */
int mgr_tid;
549e9569
NB
51
/* Entry point handed to clone() by clone_monitor().
 *
 * v is the opaque clone() argument; it carries the container's
 * struct supertype.  Runs do_monitor() on it and returns 0 once
 * do_monitor() returns.
 */
int run_child(void *v)
{
	do_monitor((struct supertype *)v);
	return 0;
}
59
60int clone_monitor(struct supertype *container)
61{
549e9569 62 static char stack[4096];
549e9569 63
2cc98f9e 64 mon_tid = clone(run_child, stack+4096-64,
549e9569
NB
65 CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD,
66 container);
3e70c845 67
4d43913c 68 mgr_tid = syscall(SYS_gettid);
2cc98f9e
DW
69
70 return mon_tid;
549e9569
NB
71}
72
73static struct superswitch *find_metadata_methods(char *vers)
74{
75 if (strcmp(vers, "ddf") == 0)
76 return &super_ddf;
5b65005f
DW
77 if (strcmp(vers, "imsm") == 0)
78 return &super_imsm;
549e9569
NB
79 return NULL;
80}
81
82
/* Write the current process id to /var/run/mdadm/<devname>.pid.
 *
 * devname: name used to build the file name.
 * o_excl:  extra open(2) flags OR-ed into O_RDWR|O_CREAT; callers pass
 *          O_EXCL to fail if the file already exists, or 0 to reuse it.
 *
 * Returns 0 on success, or a negative errno value:
 *   -ENAMETOOLONG if the path does not fit the local buffer,
 *   -errno from open() or write(),
 *   -EIO if fewer bytes than expected were written.
 */
int make_pidfile(char *devname, int o_excl)
{
	char path[100];
	char pid[32];	/* ample room for any int, '\n' and the terminator */
	int fd;
	int len;
	int err = 0;
	ssize_t n;

	len = snprintf(path, sizeof(path), "/var/run/mdadm/%s.pid", devname);
	if (len < 0 || (size_t)len >= sizeof(path))
		return -ENAMETOOLONG;

	fd = open(path, O_RDWR|O_CREAT|o_excl, 0600);
	if (fd < 0)
		return -errno;

	len = snprintf(pid, sizeof(pid), "%d\n", (int)getpid());
	n = write(fd, pid, (size_t)len);
	if (n < 0)
		err = -errno;	/* capture before close() can overwrite errno */
	else if (n != len)
		err = -EIO;
	close(fd);

	return err;
}
102
b109d928
DW
/* Send SIGTERM to the process recorded in /var/run/mdadm/<devname>.pid,
 * but only if /proc/<pid>/cmdline contains the string "mdmon".
 *
 * Silently does nothing when the pid file is missing, unreadable or does
 * not start with a positive decimal number, or when the /proc entry
 * cannot be read.
 */
static void try_kill_monitor(char *devname)
{
	char buf[100];
	char *end;
	unsigned long val;
	int fd;
	int len;
	ssize_t n;
	pid_t pid;

	len = snprintf(buf, sizeof(buf), "/var/run/mdadm/%s.pid", devname);
	if (len < 0 || (size_t)len >= sizeof(buf))
		return;
	fd = open(buf, O_RDONLY);
	if (fd < 0)
		return;

	/* leave room for the terminator that strtoul() relies on */
	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n <= 0)
		return;
	buf[n] = '\0';

	val = strtoul(buf, &end, 10);
	pid = (pid_t)val;
	/* reject "no digits", zero, and values that do not fit a pid_t */
	if (end == buf || pid <= 0 || (unsigned long)pid != val)
		return;

	/* kill this process if it is mdmon */
	snprintf(buf, sizeof(buf), "/proc/%lu/cmdline", (unsigned long) pid);
	fd = open(buf, O_RDONLY);
	if (fd < 0)
		return;

	n = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (n < 0)
		return;
	/* strstr() stops at the first NUL, so only the leading
	 * NUL-terminated part of what was read is examined.
	 */
	buf[n] = '\0';

	if (strstr(buf, "mdmon") != NULL)
		kill(pid, SIGTERM);
}
136
e0d6609f
NB
/* Remove /var/run/mdadm/<devname>.pid and /var/run/mdadm/<devname>.sock.
 *
 * Failures from unlink() are ignored.  A name too long for the local
 * path buffer is skipped rather than being allowed to overflow it.
 */
void remove_pidfile(char *devname)
{
	static const char *const suffix[] = { "pid", "sock" };
	char buf[100];
	size_t i;

	for (i = 0; i < sizeof(suffix) / sizeof(suffix[0]); i++) {
		int len = snprintf(buf, sizeof(buf), "/var/run/mdadm/%s.%s",
				   devname, suffix[i]);

		if (len < 0 || (size_t)len >= sizeof(buf))
			continue;
		unlink(buf);
	}
}
146
/* Create the listening control socket /var/run/mdadm/<devname>.sock.
 *
 * Any existing file at that path is unlinked first.  The socket is a
 * local (Unix-domain) stream socket, listening with a backlog of 10 and
 * switched to non-blocking mode.
 *
 * Returns the socket descriptor, or -1 if the path is too long or
 * socket(), bind() or listen() fails.
 */
int make_control_sock(char *devname)
{
	char path[100];
	int sfd;
	int len;
	long fl;
	struct sockaddr_un addr;

	len = snprintf(path, sizeof(path), "/var/run/mdadm/%s.sock", devname);
	if (len < 0 || (size_t)len >= sizeof(path) ||
	    (size_t)len >= sizeof(addr.sun_path))
		return -1;

	unlink(path);
	sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
	if (sfd < 0)
		return -1;

	/* do not hand uninitialised bytes to bind() */
	memset(&addr, 0, sizeof(addr));
	addr.sun_family = PF_LOCAL;
	memcpy(addr.sun_path, path, (size_t)len + 1);
	if (bind(sfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(sfd, 10) < 0) {
		close(sfd);
		return -1;
	}

	/* Only touch the flags if we managed to read them; otherwise
	 * -1 would be written back as the new flag set.
	 */
	fl = fcntl(sfd, F_GETFL, 0);
	if (fl >= 0)
		fcntl(sfd, F_SETFL, fl | O_NONBLOCK);

	return sfd;
}
172
295646b3
DW
/* Set to 1 by the SIGHUP handler below; nothing in this part of the
 * file reads or clears it.
 */
int socket_hup_requested;

/* Signal handler (installed for SIGHUP in main()): just record that the
 * signal arrived.  The signal number itself is not used.
 */
static void hup(int sig)
{
	(void)sig;
	socket_hup_requested = 1;
}
178
4d43913c
NB
/* Deliberately empty signal handler.  main() installs it for SIGUSR1 and
 * SIGALRM, so those signals are delivered to a handler that does nothing
 * instead of taking their default action.
 */
static void wake_me(int sig)
{
	(void)sig;
}
183
16ddab0d
DW
/* Decide whether main() should fork into the background.
 *
 * Returns 1 (fork) in normal builds.  In DEBUG builds, returns 0 when
 * env_no_mdmon() is non-zero, so that an mdmon started by hand for
 * debugging stays in the foreground.
 */
static int do_fork(void)
{
#ifdef DEBUG
	return env_no_mdmon() ? 0 : 1;
#else
	return 1;
#endif
}
194
195
196
549e9569
NB
197int main(int argc, char *argv[])
198{
199 int mdfd;
549e9569
NB
200 struct mdinfo *mdi, *di;
201 struct supertype *container;
4d43913c 202 sigset_t set;
bfa44e2e 203 struct sigaction act;
9fe32043
N
204 int pfd[2];
205 int status;
3d2c4fc7 206 int ignore;
4d43913c 207
549e9569
NB
208 if (argc != 2) {
209 fprintf(stderr, "Usage: md-manage /device/name/for/container\n");
210 exit(2);
211 }
212 mdfd = open(argv[1], O_RDWR);
213 if (mdfd < 0) {
214 fprintf(stderr, "md-manage: %s: %s\n", argv[1],
215 strerror(errno));
216 exit(1);
217 }
218 if (md_get_version(mdfd) < 0) {
219 fprintf(stderr, "md-manage: %s: Not an md device\n",
220 argv[1]);
221 exit(1);
222 }
223
9fe32043 224 /* Fork, and have the child tell us when they are ready */
16ddab0d 225 if (do_fork()) {
3d2c4fc7
DW
226 if (pipe(pfd) != 0) {
227 fprintf(stderr, "mdmon: failed to create pipe\n");
228 exit(1);
229 }
16ddab0d
DW
230 switch(fork()) {
231 case -1:
232 fprintf(stderr, "mdmon: failed to fork: %s\n",
233 strerror(errno));
234 exit(1);
235 case 0: /* child */
236 close(pfd[0]);
237 break;
238 default: /* parent */
239 close(pfd[1]);
240 if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
241 wait(&status);
242 status = WEXITSTATUS(status);
243 }
244 exit(status);
9fe32043 245 }
16ddab0d
DW
246 } else
247 pfd[0] = pfd[1] = -1;
549e9569
NB
248 /* hopefully it is a container - we'll check later */
249
250 container = malloc(sizeof(*container));
549e9569
NB
251 container->devnum = fd2devnum(mdfd);
252 container->devname = devnum2devname(container->devnum);
e0d6609f 253 container->device_name = argv[1];
549e9569
NB
254
255 /* If this fails, we hope it already exists */
256 mkdir("/var/run/mdadm", 0600);
257 /* pid file lives in /var/run/mdadm/mdXX.pid */
b109d928
DW
258 if (make_pidfile(container->devname, O_EXCL) < 0) {
259 if (ping_monitor(container->devname) == 0) {
260 fprintf(stderr, "mdmon: %s already managed\n",
261 container->devname);
262 exit(3);
263 } else {
295646b3
DW
264 int err;
265
b109d928
DW
266 /* cleanup the old monitor, this one is taking over */
267 try_kill_monitor(container->devname);
295646b3
DW
268 err = make_pidfile(container->devname, 0);
269 if (err < 0) {
b109d928
DW
270 fprintf(stderr, "mdmon: %s Cannot create pidfile\n",
271 container->devname);
295646b3
DW
272 if (err == -EROFS) {
273 /* FIXME implement a mechanism to
274 * prevent duplicate monitor instances
275 */
276 fprintf(stderr,
277 "mdmon: continuing on read-only file system\n");
278 } else
279 exit(3);
b109d928
DW
280 }
281 }
549e9569
NB
282 }
283
284 container->sock = make_control_sock(container->devname);
549e9569
NB
285 container->arrays = NULL;
286
287 mdi = sysfs_read(mdfd, container->devnum,
288 GET_VERSION|GET_LEVEL|GET_DEVS);
289
290 if (!mdi) {
291 fprintf(stderr, "mdmon: failed to load sysfs info for %s\n",
292 container->devname);
293 exit(3);
294 }
295 if (mdi->array.level != UnSet) {
296 fprintf(stderr, "mdmon: %s is not a container - cannot monitor\n",
297 argv[1]);
298 exit(3);
299 }
300 if (mdi->array.major_version != -1 ||
301 mdi->array.minor_version != -2) {
302 fprintf(stderr, "mdmon: %s does not use external metadata - cannot monitor\n",
303 argv[1]);
304 exit(3);
305 }
306
307 container->ss = find_metadata_methods(mdi->text_version);
308 if (container->ss == NULL) {
309 fprintf(stderr, "mdmon: %s uses unknown metadata: %s\n",
310 argv[1], mdi->text_version);
311 exit(3);
312 }
313
314 container->devs = NULL;
315 for (di = mdi->devs; di; di = di->next) {
316 struct mdinfo *cd = malloc(sizeof(*cd));
7bc1962f 317 *cd = *di;
549e9569
NB
318 cd->next = container->devs;
319 container->devs = cd;
320 }
321 sysfs_free(mdi);
322
323
324 if (container->ss->load_super(container, mdfd, argv[1])) {
325 fprintf(stderr, "mdmon: Cannot load metadata for %s\n",
326 argv[1]);
327 exit(3);
328 }
549e9569 329
9fe32043
N
330 /* Ok, this is close enough. We can say goodbye to our parent now.
331 */
332 status = 0;
3d2c4fc7
DW
333 if (write(pfd[1], &status, sizeof(status)) < 0)
334 fprintf(stderr, "mdmon: failed to notify our parent: %d\n",
335 getppid());
9fe32043
N
336 close(pfd[1]);
337
3d2c4fc7 338 ignore = chdir("/");
9fe32043
N
339 setsid();
340 close(0);
341 open("/dev/null", O_RDWR);
342 close(1);
3d2c4fc7 343 ignore = dup(0);
9fe32043
N
344#ifndef DEBUG
345 close(2);
3d2c4fc7 346 ignore = dup(0);
9fe32043
N
347#endif
348
549e9569
NB
349 mlockall(MCL_FUTURE);
350
4d43913c
NB
351 /* SIGUSR is sent between parent and child. So both block it
352 * and enable it only with pselect.
353 */
354 sigemptyset(&set);
355 sigaddset(&set, SIGUSR1);
295646b3 356 sigaddset(&set, SIGHUP);
695154b2 357 sigaddset(&set, SIGALRM);
4d43913c 358 sigprocmask(SIG_BLOCK, &set, NULL);
bfa44e2e
NB
359 act.sa_handler = wake_me;
360 act.sa_flags = 0;
361 sigaction(SIGUSR1, &act, NULL);
695154b2 362 sigaction(SIGALRM, &act, NULL);
295646b3
DW
363 act.sa_handler = hup;
364 sigaction(SIGHUP, &act, NULL);
bfa44e2e
NB
365 act.sa_handler = SIG_IGN;
366 sigaction(SIGPIPE, &act, NULL);
4d43913c 367
3e70c845 368 if (clone_monitor(container) < 0) {
295646b3 369 fprintf(stderr, "mdmon: failed to start monitor process: %s\n",
549e9569
NB
370 strerror(errno));
371 exit(2);
372 }
373
374 do_manager(container);
375
376 exit(0);
377}