2 * mdadm - manage Linux "md" devices aka RAID arrays.
4 * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * Email: <neilb@suse.de>
28 /* To restripe, we read from old geometry to a buffer, and
29 * read from buffer to new geometry.
30 * When reading, we might have missing devices and so could need
32 * When writing, we need to create correct parity and Q.
36 static int geo_map(int block
, unsigned long long stripe
, int raid_disks
,
37 int level
, int layout
)
39 /* On the given stripe, find which disk in the array will have
40 * block numbered 'block'.
41 * '-1' means the parity block.
42 * '-2' means the Q syndrome.
46 /* layout is not relevant for raid0 and raid4 */
51 switch(level
*100 + layout
) {
54 case 500 + ALGORITHM_PARITY_N
:
55 /* raid 4 isn't messed around by parity blocks */
57 return raid_disks
-1; /* parity block */
59 case 500 + ALGORITHM_LEFT_ASYMMETRIC
:
60 pd
= (raid_disks
-1) - stripe
% raid_disks
;
61 if (block
== -1) return pd
;
66 case 500 + ALGORITHM_RIGHT_ASYMMETRIC
:
67 pd
= stripe
% raid_disks
;
68 if (block
== -1) return pd
;
73 case 500 + ALGORITHM_LEFT_SYMMETRIC
:
74 pd
= (raid_disks
- 1) - stripe
% raid_disks
;
75 if (block
== -1) return pd
;
76 return (pd
+ 1 + block
) % raid_disks
;
78 case 500 + ALGORITHM_RIGHT_SYMMETRIC
:
79 pd
= stripe
% raid_disks
;
80 if (block
== -1) return pd
;
81 return (pd
+ 1 + block
) % raid_disks
;
83 case 500 + ALGORITHM_PARITY_0
:
87 case 600 + ALGORITHM_PARITY_N_6
:
89 return raid_disks
- 1;
91 return raid_disks
- 2; /* parity block */
93 case 600 + ALGORITHM_LEFT_ASYMMETRIC_6
:
95 return raid_disks
- 1;
97 pd
= (raid_disks
-1) - stripe
% raid_disks
;
98 if (block
== -1) return pd
;
103 case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6
:
105 return raid_disks
- 1;
107 pd
= stripe
% raid_disks
;
108 if (block
== -1) return pd
;
113 case 600 + ALGORITHM_LEFT_SYMMETRIC_6
:
115 return raid_disks
- 1;
117 pd
= (raid_disks
- 1) - stripe
% raid_disks
;
118 if (block
== -1) return pd
;
119 return (pd
+ 1 + block
) % raid_disks
;
121 case 600 + ALGORITHM_RIGHT_SYMMETRIC_6
:
123 return raid_disks
- 1;
125 pd
= stripe
% raid_disks
;
126 if (block
== -1) return pd
;
127 return (pd
+ 1 + block
) % raid_disks
;
129 case 600 + ALGORITHM_PARITY_0_6
:
131 return raid_disks
- 1;
135 case 600 + ALGORITHM_PARITY_0
:
142 case 600 + ALGORITHM_LEFT_ASYMMETRIC
:
143 pd
= raid_disks
- 1 - (stripe
% raid_disks
);
144 if (block
== -1) return pd
;
145 if (block
== -2) return (pd
+1) % raid_disks
;
146 if (pd
== raid_disks
- 1)
152 case 600 + ALGORITHM_ROTATING_ZERO_RESTART
:
153 /* Different order for calculating Q, otherwize same as ... */
154 case 600 + ALGORITHM_RIGHT_ASYMMETRIC
:
155 pd
= stripe
% raid_disks
;
156 if (block
== -1) return pd
;
157 if (block
== -2) return (pd
+1) % raid_disks
;
158 if (pd
== raid_disks
- 1)
164 case 600 + ALGORITHM_LEFT_SYMMETRIC
:
165 pd
= raid_disks
- 1 - (stripe
% raid_disks
);
166 if (block
== -1) return pd
;
167 if (block
== -2) return (pd
+1) % raid_disks
;
168 return (pd
+ 2 + block
) % raid_disks
;
170 case 600 + ALGORITHM_RIGHT_SYMMETRIC
:
171 pd
= stripe
% raid_disks
;
172 if (block
== -1) return pd
;
173 if (block
== -2) return (pd
+1) % raid_disks
;
174 return (pd
+ 2 + block
) % raid_disks
;
177 case 600 + ALGORITHM_ROTATING_N_RESTART
:
178 /* Same a left_asymmetric, by first stripe is
179 * D D D P Q rather than
182 pd
= raid_disks
- 1 - ((stripe
+ 1) % raid_disks
);
183 if (block
== -1) return pd
;
184 if (block
== -2) return (pd
+1) % raid_disks
;
185 if (pd
== raid_disks
- 1)
191 case 600 + ALGORITHM_ROTATING_N_CONTINUE
:
192 /* Same as left_symmetric but Q is before P */
193 pd
= raid_disks
- 1 - (stripe
% raid_disks
);
194 if (block
== -1) return pd
;
195 if (block
== -2) return (pd
+raid_disks
-1) % raid_disks
;
196 return (pd
+ 1 + block
) % raid_disks
;
200 static int is_ddf(int layout
)
206 case ALGORITHM_ROTATING_N_CONTINUE
:
207 case ALGORITHM_ROTATING_N_RESTART
:
208 case ALGORITHM_ROTATING_ZERO_RESTART
:
214 static void xor_blocks(char *target
, char **sources
, int disks
, int size
)
217 /* Amazingly inefficient... */
218 for (i
=0; i
<size
; i
++) {
220 for (j
=0 ; j
<disks
; j
++)
226 static void qsyndrome(uint8_t *p
, uint8_t *q
, uint8_t **sources
, int disks
, int size
)
229 uint8_t wq0
, wp0
, wd0
, w10
, w20
;
230 for ( d
= 0; d
< size
; d
++) {
231 wq0
= wp0
= sources
[disks
-1][d
];
232 for ( z
= disks
-2 ; z
>= 0 ; z
-- ) {
235 w20
= (wq0
&0x80) ? 0xff : 0x00;
236 w10
= (wq0
<< 1) & 0xff;
248 * The following was taken from linux/drivers/md/mktables.c, and modified
249 * to create in-memory tables rather than C code
251 static uint8_t gfmul(uint8_t a
, uint8_t b
)
258 a
= (a
<< 1) ^ (a
& 0x80 ? 0x1d : 0);
265 static uint8_t gfpow(uint8_t a
, int b
)
283 int tables_ready
= 0;
284 uint8_t raid6_gfmul
[256][256];
285 uint8_t raid6_gfexp
[256];
286 uint8_t raid6_gfinv
[256];
287 uint8_t raid6_gfexi
[256];
288 void make_tables(void)
293 /* Compute multiplication table */
294 for (i
= 0; i
< 256; i
++)
295 for (j
= 0; j
< 256; j
++)
296 raid6_gfmul
[i
][j
] = gfmul(i
, j
);
298 /* Compute power-of-2 table (exponent) */
300 for (i
= 0; i
< 256; i
++) {
304 v
= 0; /* For entry 255, not a real entry */
307 /* Compute inverse table x^-1 == x^254 */
308 for (i
= 0; i
< 256; i
++)
309 raid6_gfinv
[i
] = gfpow(i
, 254);
311 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
312 for (i
= 0; i
< 256; i
++)
313 raid6_gfexi
[i
] = raid6_gfinv
[raid6_gfexp
[i
] ^ 1];
319 /* Following was taken from linux/drivers/md/raid6recov.c */
321 /* Recover two failed data blocks. */
322 void raid6_2data_recov(int disks
, size_t bytes
, int faila
, int failb
,
325 uint8_t *p
, *q
, *dp
, *dq
;
327 const uint8_t *pbmul
; /* P multiplier table for B data */
328 const uint8_t *qmul
; /* Q multiplier table (for both) */
333 /* Compute syndrome with zero for the missing data pages
334 Use the dead data pages as temporary storage for
335 delta p and delta q */
341 qsyndrome(dp
, dq
, ptrs
, disks
-2, bytes
);
343 /* Restore pointer table */
347 /* Now, pick the proper data tables */
348 pbmul
= raid6_gfmul
[raid6_gfexi
[failb
-faila
]];
349 qmul
= raid6_gfmul
[raid6_gfinv
[raid6_gfexp
[faila
]^raid6_gfexp
[failb
]]];
355 *dq
++ = db
= pbmul
[px
] ^ qx
; /* Reconstructed B */
356 *dp
++ = db
^ px
; /* Reconstructed A */
361 /* Recover failure of one data block plus the P block */
362 void raid6_datap_recov(int disks
, size_t bytes
, int faila
, uint8_t **ptrs
)
365 const uint8_t *qmul
; /* Q multiplier table */
370 /* Compute syndrome with zero for the missing data page
371 Use the dead data page as temporary storage for delta q */
375 qsyndrome(p
, dq
, ptrs
, disks
-2, bytes
);
377 /* Restore pointer table */
380 /* Now, pick the proper data tables */
381 qmul
= raid6_gfmul
[raid6_gfinv
[raid6_gfexp
[faila
]]];
385 *p
++ ^= *dq
= qmul
[*q
^ *dq
];
392 * A list of 'fds' of the active disks. Some may be absent.
393 * A geometry: raid_disks, chunk_size, level, layout
394 * A list of 'fds' for mirrored targets. They are already seeked to
395 * right (Write) location
396 * A start and length which must be stripe-aligned
397 * 'buf' is large enough to hold one stripe, and is aligned
400 int save_stripes(int *source
, unsigned long long *offsets
,
401 int raid_disks
, int chunk_size
, int level
, int layout
,
402 int nwrites
, int *dest
,
403 unsigned long long start
, unsigned long long length
,
407 int data_disks
= raid_disks
- (level
== 0 ? 0 : level
<=5 ? 1 : 2);
415 zero
= malloc(chunk_size
);
416 memset(zero
, 0, chunk_size
);
419 len
= data_disks
* chunk_size
;
422 int fdisk
[3], fblock
[3];
423 for (disk
= 0; disk
< raid_disks
; disk
++) {
424 unsigned long long offset
;
427 offset
= (start
/chunk_size
/data_disks
)*chunk_size
;
428 dnum
= geo_map(disk
< data_disks
? disk
: data_disks
- disk
- 1,
429 start
/chunk_size
/data_disks
,
430 raid_disks
, level
, layout
);
431 if (dnum
< 0) abort();
432 if (source
[dnum
] < 0 ||
433 lseek64(source
[dnum
], offsets
[dnum
]+offset
, 0) < 0 ||
434 read(source
[dnum
], buf
+disk
* chunk_size
, chunk_size
)
437 fdisk
[failed
] = dnum
;
438 fblock
[failed
] = disk
;
442 if (failed
== 0 || fblock
[0] >= data_disks
)
443 /* all data disks are good */
445 else if (failed
== 1 || fblock
[1] >= data_disks
+1) {
446 /* one failed data disk and good parity */
447 char *bufs
[data_disks
];
448 for (i
=0; i
< data_disks
; i
++)
450 bufs
[i
] = buf
+ data_disks
*chunk_size
;
452 bufs
[i
] = buf
+ i
*chunk_size
;
454 xor_blocks(buf
+ fblock
[0]*chunk_size
,
455 bufs
, data_disks
, chunk_size
);
456 } else if (failed
> 2 || level
!= 6)
457 /* too much failure */
460 /* RAID6 computations needed. */
461 uint8_t *bufs
[data_disks
+4];
464 disk
= geo_map(-1, start
/chunk_size
/data_disks
,
465 raid_disks
, level
, layout
);
466 qdisk
= geo_map(-2, start
/chunk_size
/data_disks
,
467 raid_disks
, level
, layout
);
468 if (is_ddf(layout
)) {
469 /* q over 'raid_disks' blocks, in device order.
470 * 'p' and 'q' get to be all zero
472 for (i
= 0; i
< raid_disks
; i
++)
474 for (i
= 0; i
< data_disks
; i
++) {
475 int dnum
= geo_map(i
,
476 start
/chunk_size
/data_disks
,
477 raid_disks
, level
, layout
);
479 /* i is the logical block number, so is index to 'buf'.
480 * dnum is physical disk number
481 * and thus the syndrome number.
484 bufs
[snum
] = (uint8_t*)buf
+ chunk_size
* i
;
486 syndrome_disks
= raid_disks
;
488 /* for md, q is over 'data_disks' blocks,
489 * starting immediately after 'q'
490 * Note that for the '_6' variety, the p block
491 * makes a hole that we need to be careful of.
495 for (j
= 0; j
< raid_disks
; j
++) {
496 int dnum
= (qdisk
+ 1 + j
) % raid_disks
;
497 if (dnum
== disk
|| dnum
== qdisk
)
499 for (i
= 0; i
< data_disks
; i
++)
501 start
/chunk_size
/data_disks
,
502 raid_disks
, level
, layout
) == dnum
)
504 /* i is the logical block number, so is index to 'buf'.
505 * dnum is physical disk number
506 * snum is syndrome disk for which 0 is immediately after Q
508 bufs
[snum
] = (uint8_t*)buf
+ chunk_size
* i
;
517 syndrome_disks
= data_disks
;
520 /* Place P and Q blocks at end of bufs */
521 bufs
[syndrome_disks
] = (uint8_t*)buf
+ chunk_size
* data_disks
;
522 bufs
[syndrome_disks
+1] = (uint8_t*)buf
+ chunk_size
* (data_disks
+1);
524 if (fblock
[1] == data_disks
)
525 /* One data failed, and parity failed */
526 raid6_datap_recov(syndrome_disks
+2, chunk_size
,
529 if (fdisk
[0] > fdisk
[1]) {
534 /* Two data blocks failed, P,Q OK */
535 raid6_2data_recov(syndrome_disks
+2, chunk_size
,
536 fdisk
[0], fdisk
[1], bufs
);
540 for (i
=0; i
<nwrites
; i
++)
541 if (write(dest
[i
], buf
, len
) != len
)
552 * A list of 'fds' of the active disks. Some may be '-1' for not-available.
553 * A geometry: raid_disks, chunk_size, level, layout
554 * An 'fd' to read from. It is already seeked to the right (Read) location.
555 * A start and length.
556 * The length must be a multiple of the stripe size.
558 * We build a full stripe in memory and then write it out.
559 * We assume that there are enough working devices.
561 int restore_stripes(int *dest
, unsigned long long *offsets
,
562 int raid_disks
, int chunk_size
, int level
, int layout
,
563 int source
, unsigned long long read_offset
,
564 unsigned long long start
, unsigned long long length
)
567 char **stripes
= malloc(raid_disks
* sizeof(char*));
568 char **blocks
= malloc(raid_disks
* sizeof(char*));
571 int data_disks
= raid_disks
- (level
== 0 ? 0 : level
<= 5 ? 1 : 2);
573 if (posix_memalign((void**)&stripe_buf
, 4096, raid_disks
* chunk_size
))
576 zero
= malloc(chunk_size
);
578 memset(zero
, 0, chunk_size
);
580 if (stripe_buf
== NULL
|| stripes
== NULL
|| blocks
== NULL
588 for (i
=0; i
<raid_disks
; i
++)
589 stripes
[i
] = stripe_buf
+ i
* chunk_size
;
591 unsigned int len
= data_disks
* chunk_size
;
592 unsigned long long offset
;
597 for (i
=0; i
< data_disks
; i
++) {
598 int disk
= geo_map(i
, start
/chunk_size
/data_disks
,
599 raid_disks
, level
, layout
);
600 if ((unsigned long long)lseek64(source
, read_offset
, 0)
603 if (read(source
, stripes
[disk
],
604 chunk_size
) != chunk_size
)
606 read_offset
+= chunk_size
;
608 /* We have the data, now do the parity */
609 offset
= (start
/chunk_size
/data_disks
) * chunk_size
;
613 disk
= geo_map(-1, start
/chunk_size
/data_disks
,
614 raid_disks
, level
, layout
);
615 for (i
= 0; i
< data_disks
; i
++)
616 blocks
[i
] = stripes
[(disk
+1+i
) % raid_disks
];
617 xor_blocks(stripes
[disk
], blocks
, data_disks
, chunk_size
);
620 disk
= geo_map(-1, start
/chunk_size
/data_disks
,
621 raid_disks
, level
, layout
);
622 qdisk
= geo_map(-2, start
/chunk_size
/data_disks
,
623 raid_disks
, level
, layout
);
624 if (is_ddf(layout
)) {
625 /* q over 'raid_disks' blocks, in device order.
626 * 'p' and 'q' get to be all zero
628 for (i
= 0; i
< raid_disks
; i
++)
629 if (i
== disk
|| i
== qdisk
)
630 blocks
[i
] = (char*)zero
;
632 blocks
[i
] = stripes
[i
];
633 syndrome_disks
= raid_disks
;
635 /* for md, q is over 'data_disks' blocks,
636 * starting immediately after 'q'
638 for (i
= 0; i
< data_disks
; i
++)
639 blocks
[i
] = stripes
[(qdisk
+1+i
) % raid_disks
];
641 syndrome_disks
= data_disks
;
643 qsyndrome((uint8_t*)stripes
[disk
],
644 (uint8_t*)stripes
[qdisk
],
646 syndrome_disks
, chunk_size
);
649 for (i
=0; i
< raid_disks
; i
++)
651 if (lseek64(dest
[i
], offsets
[i
]+offset
, 0) < 0)
653 if (write(dest
[i
], stripes
[i
], chunk_size
) != chunk_size
)
664 int test_stripes(int *source
, unsigned long long *offsets
,
665 int raid_disks
, int chunk_size
, int level
, int layout
,
666 unsigned long long start
, unsigned long long length
)
668 /* ready the data and p (and q) blocks, and check we got them right */
669 char *stripe_buf
= malloc(raid_disks
* chunk_size
);
670 char **stripes
= malloc(raid_disks
* sizeof(char*));
671 char **blocks
= malloc(raid_disks
* sizeof(char*));
672 char *p
= malloc(chunk_size
);
673 char *q
= malloc(chunk_size
);
676 int data_disks
= raid_disks
- (level
== 5 ? 1: 2);
677 for ( i
= 0 ; i
< raid_disks
; i
++)
678 stripes
[i
] = stripe_buf
+ i
* chunk_size
;
683 for (i
= 0 ; i
< raid_disks
; i
++) {
684 lseek64(source
[i
], offsets
[i
]+start
, 0);
685 read(source
[i
], stripes
[i
], chunk_size
);
687 for (i
= 0 ; i
< data_disks
; i
++) {
688 int disk
= geo_map(i
, start
/chunk_size
, raid_disks
,
690 blocks
[i
] = stripes
[disk
];
691 printf("%d->%d\n", i
, disk
);
695 qsyndrome(p
, q
, (uint8_t**)blocks
, data_disks
, chunk_size
);
696 disk
= geo_map(-1, start
/chunk_size
, raid_disks
,
698 if (memcmp(p
, stripes
[disk
], chunk_size
) != 0) {
699 printf("P(%d) wrong at %llu\n", disk
,
702 disk
= geo_map(-2, start
/chunk_size
, raid_disks
,
704 if (memcmp(q
, stripes
[disk
], chunk_size
) != 0) {
705 printf("Q(%d) wrong at %llu\n", disk
,
710 length
-= chunk_size
;
716 unsigned long long getnum(char *str
, char **err
)
719 unsigned long long rv
= strtoull(str
, &e
, 10);
727 main(int argc
, char *argv
[])
729 /* save/restore file raid_disks chunk_size level layout start length devices...
736 unsigned long long *offsets
;
737 int raid_disks
, chunk_size
, level
, layout
;
738 unsigned long long start
, length
;
743 fprintf(stderr
, "Usage: test_stripe save/restore file raid_disks"
744 " chunk_size level layout start length devices...\n");
747 if (strcmp(argv
[1], "save")==0)
749 else if (strcmp(argv
[1], "restore") == 0)
751 else if (strcmp(argv
[1], "test") == 0)
754 fprintf(stderr
, "test_stripe: must give 'save' or 'restore'.\n");
759 raid_disks
= getnum(argv
[3], &err
);
760 chunk_size
= getnum(argv
[4], &err
);
761 level
= getnum(argv
[5], &err
);
762 layout
= getnum(argv
[6], &err
);
763 start
= getnum(argv
[7], &err
);
764 length
= getnum(argv
[8], &err
);
766 fprintf(stderr
, "test_stripe: Bad number: %s\n", err
);
769 if (argc
!= raid_disks
+ 9) {
770 fprintf(stderr
, "test_stripe: wrong number of devices: want %d found %d\n",
774 fds
= malloc(raid_disks
* sizeof(*fds
));
775 offsets
= malloc(raid_disks
* sizeof(*offsets
));
776 memset(offsets
, 0, raid_disks
* sizeof(*offsets
));
778 storefd
= open(file
, O_RDWR
);
781 fprintf(stderr
, "test_stripe: could not open %s.\n", file
);
784 for (i
=0; i
<raid_disks
; i
++) {
785 fds
[i
] = open(argv
[9+i
], O_RDWR
);
788 fprintf(stderr
,"test_stripe: cannot open %s.\n", argv
[9+i
]);
793 buf
= malloc(raid_disks
* chunk_size
);
796 int rv
= save_stripes(fds
, offsets
,
797 raid_disks
, chunk_size
, level
, layout
,
802 "test_stripe: save_stripes returned %d\n", rv
);
805 } else if (save
== 2) {
806 int rv
= test_stripes(fds
, offsets
,
807 raid_disks
, chunk_size
, level
, layout
,
811 "test_stripe: test_stripes returned %d\n", rv
);
815 int rv
= restore_stripes(fds
, offsets
,
816 raid_disks
, chunk_size
, level
, layout
,
821 "test_stripe: restore_stripes returned %d\n",