2 * mdadm - manage Linux "md" devices aka RAID arrays.
4 * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 * Email: <neilb@suse.de>
28 /* To restripe, we read from old geometry to a buffer, and
29 * read from buffer to new geometry.
30 * When reading, we might have missing devices and so could need
32 * When writing, we need to create correct parity and Q.
36 static int geo_map(int block
, unsigned long long stripe
, int raid_disks
,
37 int level
, int layout
)
39 /* On the given stripe, find which disk in the array will have
40 * block numbered 'block'.
41 * '-1' means the parity block.
42 * '-2' means the Q syndrome.
46 switch(level
*100 + layout
) {
49 case 500 + ALGORITHM_PARITY_N
:
50 /* raid 4 isn't messed around by parity blocks */
52 return raid_disks
-1; /* parity block */
54 case 500 + ALGORITHM_LEFT_ASYMMETRIC
:
55 pd
= (raid_disks
-1) - stripe
% raid_disks
;
56 if (block
== -1) return pd
;
61 case 500 + ALGORITHM_RIGHT_ASYMMETRIC
:
62 pd
= stripe
% raid_disks
;
63 if (block
== -1) return pd
;
68 case 500 + ALGORITHM_LEFT_SYMMETRIC
:
69 pd
= (raid_disks
- 1) - stripe
% raid_disks
;
70 if (block
== -1) return pd
;
71 return (pd
+ 1 + block
) % raid_disks
;
73 case 500 + ALGORITHM_RIGHT_SYMMETRIC
:
74 pd
= stripe
% raid_disks
;
75 if (block
== -1) return pd
;
76 return (pd
+ 1 + block
) % raid_disks
;
78 case 500 + ALGORITHM_PARITY_0
:
82 case 600 + ALGORITHM_PARITY_N_6
:
84 return raid_disks
- 1;
86 return raid_disks
- 2; /* parity block */
88 case 600 + ALGORITHM_LEFT_ASYMMETRIC_6
:
90 return raid_disks
- 1;
92 pd
= (raid_disks
-1) - stripe
% raid_disks
;
93 if (block
== -1) return pd
;
98 case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6
:
100 return raid_disks
- 1;
102 pd
= stripe
% raid_disks
;
103 if (block
== -1) return pd
;
108 case 600 + ALGORITHM_LEFT_SYMMETRIC_6
:
110 return raid_disks
- 1;
112 pd
= (raid_disks
- 1) - stripe
% raid_disks
;
113 if (block
== -1) return pd
;
114 return (pd
+ 1 + block
) % raid_disks
;
116 case 600 + ALGORITHM_RIGHT_SYMMETRIC_6
:
118 return raid_disks
- 1;
120 pd
= stripe
% raid_disks
;
121 if (block
== -1) return pd
;
122 return (pd
+ 1 + block
) % raid_disks
;
124 case 600 + ALGORITHM_PARITY_0_6
:
126 return raid_disks
- 1;
130 case 600 + ALGORITHM_PARITY_0
:
137 case 600 + ALGORITHM_LEFT_ASYMMETRIC
:
138 pd
= raid_disks
- 1 - (stripe
% raid_disks
);
139 if (block
== -1) return pd
;
140 if (block
== -2) return (pd
+1) % raid_disks
;
141 if (pd
== raid_disks
- 1)
147 case 600 + ALGORITHM_ROTATING_ZERO_RESTART
:
148 /* Different order for calculating Q, otherwize same as ... */
149 case 600 + ALGORITHM_RIGHT_ASYMMETRIC
:
150 pd
= stripe
% raid_disks
;
151 if (block
== -1) return pd
;
152 if (block
== -2) return (pd
+1) % raid_disks
;
153 if (pd
== raid_disks
- 1)
159 case 600 + ALGORITHM_LEFT_SYMMETRIC
:
160 pd
= raid_disks
- 1 - (stripe
% raid_disks
);
161 if (block
== -1) return pd
;
162 if (block
== -2) return (pd
+1) % raid_disks
;
163 return (pd
+ 2 + block
) % raid_disks
;
165 case 600 + ALGORITHM_RIGHT_SYMMETRIC
:
166 pd
= stripe
% raid_disks
;
167 if (block
== -1) return pd
;
168 if (block
== -2) return (pd
+1) % raid_disks
;
169 return (pd
+ 2 + block
) % raid_disks
;
172 case 600 + ALGORITHM_ROTATING_N_RESTART
:
173 /* Same a left_asymmetric, by first stripe is
174 * D D D P Q rather than
177 pd
= raid_disks
- 1 - ((stripe
+ 1) % raid_disks
);
178 if (block
== -1) return pd
;
179 if (block
== -2) return (pd
+1) % raid_disks
;
180 if (pd
== raid_disks
- 1)
186 case 600 + ALGORITHM_ROTATING_N_CONTINUE
:
187 /* Same as left_symmetric but Q is before P */
188 pd
= raid_disks
- 1 - (stripe
% raid_disks
);
189 if (block
== -1) return pd
;
190 if (block
== -2) return (pd
+raid_disks
-1) % raid_disks
;
191 return (pd
+ 1 + block
) % raid_disks
;
195 static int is_ddf(int layout
)
201 case ALGORITHM_ROTATING_N_CONTINUE
:
202 case ALGORITHM_ROTATING_N_RESTART
:
203 case ALGORITHM_ROTATING_ZERO_RESTART
:
209 static void xor_blocks(char *target
, char **sources
, int disks
, int size
)
212 /* Amazingly inefficient... */
213 for (i
=0; i
<size
; i
++) {
215 for (j
=0 ; j
<disks
; j
++)
221 static void qsyndrome(uint8_t *p
, uint8_t *q
, uint8_t **sources
, int disks
, int size
)
224 uint8_t wq0
, wp0
, wd0
, w10
, w20
;
225 for ( d
= 0; d
< size
; d
++) {
226 wq0
= wp0
= sources
[disks
-1][d
];
227 for ( z
= disks
-2 ; z
>= 0 ; z
-- ) {
230 w20
= (wq0
&0x80) ? 0xff : 0x00;
231 w10
= (wq0
<< 1) & 0xff;
243 * The following was taken from linux/drivers/md/mktables.c, and modified
244 * to create in-memory tables rather than C code
246 static uint8_t gfmul(uint8_t a
, uint8_t b
)
253 a
= (a
<< 1) ^ (a
& 0x80 ? 0x1d : 0);
260 static uint8_t gfpow(uint8_t a
, int b
)
278 int tables_ready
= 0;
279 uint8_t raid6_gfmul
[256][256];
280 uint8_t raid6_gfexp
[256];
281 uint8_t raid6_gfinv
[256];
282 uint8_t raid6_gfexi
[256];
283 void make_tables(void)
288 /* Compute multiplication table */
289 for (i
= 0; i
< 256; i
++)
290 for (j
= 0; j
< 256; j
++)
291 raid6_gfmul
[i
][j
] = gfmul(i
, j
);
293 /* Compute power-of-2 table (exponent) */
295 for (i
= 0; i
< 256; i
++) {
299 v
= 0; /* For entry 255, not a real entry */
302 /* Compute inverse table x^-1 == x^254 */
303 for (i
= 0; i
< 256; i
++)
304 raid6_gfinv
[i
] = gfpow(i
, 254);
306 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
307 for (i
= 0; i
< 256; i
++)
308 raid6_gfexi
[i
] = raid6_gfinv
[raid6_gfexp
[i
] ^ 1];
314 /* Following was taken from linux/drivers/md/raid6recov.c */
316 /* Recover two failed data blocks. */
317 void raid6_2data_recov(int disks
, size_t bytes
, int faila
, int failb
,
320 uint8_t *p
, *q
, *dp
, *dq
;
322 const uint8_t *pbmul
; /* P multiplier table for B data */
323 const uint8_t *qmul
; /* Q multiplier table (for both) */
328 /* Compute syndrome with zero for the missing data pages
329 Use the dead data pages as temporary storage for
330 delta p and delta q */
336 qsyndrome(dp
, dq
, ptrs
, disks
-2, bytes
);
338 /* Restore pointer table */
342 /* Now, pick the proper data tables */
343 pbmul
= raid6_gfmul
[raid6_gfexi
[failb
-faila
]];
344 qmul
= raid6_gfmul
[raid6_gfinv
[raid6_gfexp
[faila
]^raid6_gfexp
[failb
]]];
350 *dq
++ = db
= pbmul
[px
] ^ qx
; /* Reconstructed B */
351 *dp
++ = db
^ px
; /* Reconstructed A */
356 /* Recover failure of one data block plus the P block */
357 void raid6_datap_recov(int disks
, size_t bytes
, int faila
, uint8_t **ptrs
)
360 const uint8_t *qmul
; /* Q multiplier table */
365 /* Compute syndrome with zero for the missing data page
366 Use the dead data page as temporary storage for delta q */
370 qsyndrome(p
, dq
, ptrs
, disks
-2, bytes
);
372 /* Restore pointer table */
375 /* Now, pick the proper data tables */
376 qmul
= raid6_gfmul
[raid6_gfinv
[raid6_gfexp
[faila
]]];
380 *p
++ ^= *dq
= qmul
[*q
^ *dq
];
387 * A list of 'fds' of the active disks. Some may be absent.
388 * A geometry: raid_disks, chunk_size, level, layout
389 * A list of 'fds' for mirrored targets. They are already seeked to
390 * right (Write) location
391 * A start and length which must be stripe-aligned
392 * 'buf' is large enough to hold one stripe, and is aligned
395 int save_stripes(int *source
, unsigned long long *offsets
,
396 int raid_disks
, int chunk_size
, int level
, int layout
,
397 int nwrites
, int *dest
,
398 unsigned long long start
, unsigned long long length
,
402 int data_disks
= raid_disks
- (level
== 0 ? 0 : level
<=5 ? 1 : 2);
410 zero
= malloc(chunk_size
);
411 memset(zero
, 0, chunk_size
);
414 len
= data_disks
* chunk_size
;
417 int fdisk
[3], fblock
[3];
418 for (disk
= 0; disk
< raid_disks
; disk
++) {
419 unsigned long long offset
;
422 offset
= (start
/chunk_size
/data_disks
)*chunk_size
;
423 dnum
= geo_map(disk
< data_disks
? disk
: data_disks
- disk
- 1,
424 start
/chunk_size
/data_disks
,
425 raid_disks
, level
, layout
);
426 if (dnum
< 0) abort();
427 if (source
[dnum
] < 0 ||
428 lseek64(source
[dnum
], offsets
[dnum
]+offset
, 0) < 0 ||
429 read(source
[dnum
], buf
+disk
* chunk_size
, chunk_size
)
432 fdisk
[failed
] = dnum
;
433 fblock
[failed
] = disk
;
437 if (failed
== 0 || fblock
[0] >= data_disks
)
438 /* all data disks are good */
440 else if (failed
== 1 || fblock
[1] >= data_disks
+1) {
441 /* one failed data disk and good parity */
442 char *bufs
[data_disks
];
443 for (i
=0; i
< data_disks
; i
++)
445 bufs
[i
] = buf
+ data_disks
*chunk_size
;
447 bufs
[i
] = buf
+ i
*chunk_size
;
449 xor_blocks(buf
+ fblock
[0]*chunk_size
,
450 bufs
, data_disks
, chunk_size
);
451 } else if (failed
> 2 || level
!= 6)
452 /* too much failure */
455 /* RAID6 computations needed. */
456 uint8_t *bufs
[data_disks
+4];
459 disk
= geo_map(-1, start
/chunk_size
/data_disks
,
460 raid_disks
, level
, layout
);
461 qdisk
= geo_map(-2, start
/chunk_size
/data_disks
,
462 raid_disks
, level
, layout
);
463 if (is_ddf(layout
)) {
464 /* q over 'raid_disks' blocks, in device order.
465 * 'p' and 'q' get to be all zero
467 for (i
= 0; i
< raid_disks
; i
++)
469 for (i
= 0; i
< data_disks
; i
++) {
470 int dnum
= geo_map(i
,
471 start
/chunk_size
/data_disks
,
472 raid_disks
, level
, layout
);
474 /* i is the logical block number, so is index to 'buf'.
475 * dnum is physical disk number
476 * and thus the syndrome number.
479 bufs
[snum
] = (uint8_t*)buf
+ chunk_size
* i
;
481 syndrome_disks
= raid_disks
;
483 /* for md, q is over 'data_disks' blocks,
484 * starting immediately after 'q'
485 * Note that for the '_6' variety, the p block
486 * makes a hole that we need to be careful of.
490 for (j
= 0; j
< raid_disks
; j
++) {
491 int dnum
= (qdisk
+ 1 + j
) % raid_disks
;
492 if (dnum
== disk
|| dnum
== qdisk
)
494 for (i
= 0; i
< data_disks
; i
++)
496 start
/chunk_size
/data_disks
,
497 raid_disks
, level
, layout
) == dnum
)
499 /* i is the logical block number, so is index to 'buf'.
500 * dnum is physical disk number
501 * snum is syndrome disk for which 0 is immediately after Q
503 bufs
[snum
] = (uint8_t*)buf
+ chunk_size
* i
;
512 syndrome_disks
= data_disks
;
515 /* Place P and Q blocks at end of bufs */
516 bufs
[syndrome_disks
] = (uint8_t*)buf
+ chunk_size
* data_disks
;
517 bufs
[syndrome_disks
+1] = (uint8_t*)buf
+ chunk_size
* (data_disks
+1);
519 if (fblock
[1] == data_disks
)
520 /* One data failed, and parity failed */
521 raid6_datap_recov(syndrome_disks
+2, chunk_size
,
524 if (fdisk
[0] > fdisk
[1]) {
529 /* Two data blocks failed, P,Q OK */
530 raid6_2data_recov(syndrome_disks
+2, chunk_size
,
531 fdisk
[0], fdisk
[1], bufs
);
535 for (i
=0; i
<nwrites
; i
++)
536 if (write(dest
[i
], buf
, len
) != len
)
547 * A list of 'fds' of the active disks. Some may be '-1' for not-available.
548 * A geometry: raid_disks, chunk_size, level, layout
549 * An 'fd' to read from. It is already seeked to the right (Read) location.
550 * A start and length.
551 * The length must be a multiple of the stripe size.
553 * We build a full stripe in memory and then write it out.
554 * We assume that there are enough working devices.
556 int restore_stripes(int *dest
, unsigned long long *offsets
,
557 int raid_disks
, int chunk_size
, int level
, int layout
,
558 int source
, unsigned long long read_offset
,
559 unsigned long long start
, unsigned long long length
)
562 char **stripes
= malloc(raid_disks
* sizeof(char*));
563 char **blocks
= malloc(raid_disks
* sizeof(char*));
566 int data_disks
= raid_disks
- (level
== 0 ? 0 : level
<= 5 ? 1 : 2);
568 posix_memalign((void**)&stripe_buf
, 4096, raid_disks
* chunk_size
);
570 zero
= malloc(chunk_size
);
572 memset(zero
, 0, chunk_size
);
574 if (stripe_buf
== NULL
|| stripes
== NULL
|| blocks
== NULL
582 for (i
=0; i
<raid_disks
; i
++)
583 stripes
[i
] = stripe_buf
+ i
* chunk_size
;
585 int len
= data_disks
* chunk_size
;
586 unsigned long long offset
;
591 for (i
=0; i
< data_disks
; i
++) {
592 int disk
= geo_map(i
, start
/chunk_size
/data_disks
,
593 raid_disks
, level
, layout
);
594 if (lseek64(source
, read_offset
, 0) != read_offset
)
596 if (read(source
, stripes
[disk
], chunk_size
) != chunk_size
)
598 read_offset
+= chunk_size
;
600 /* We have the data, now do the parity */
601 offset
= (start
/chunk_size
/data_disks
) * chunk_size
;
605 disk
= geo_map(-1, start
/chunk_size
/data_disks
,
606 raid_disks
, level
, layout
);
607 for (i
= 0; i
< data_disks
; i
++)
608 blocks
[i
] = stripes
[(disk
+1+i
) % raid_disks
];
609 xor_blocks(stripes
[disk
], blocks
, data_disks
, chunk_size
);
612 disk
= geo_map(-1, start
/chunk_size
/data_disks
,
613 raid_disks
, level
, layout
);
614 qdisk
= geo_map(-2, start
/chunk_size
/data_disks
,
615 raid_disks
, level
, layout
);
616 if (is_ddf(layout
)) {
617 /* q over 'raid_disks' blocks, in device order.
618 * 'p' and 'q' get to be all zero
620 for (i
= 0; i
< raid_disks
; i
++)
621 if (i
== disk
|| i
== qdisk
)
622 blocks
[i
] = (char*)zero
;
624 blocks
[i
] = stripes
[i
];
625 syndrome_disks
= raid_disks
;
627 /* for md, q is over 'data_disks' blocks,
628 * starting immediately after 'q'
630 for (i
= 0; i
< data_disks
; i
++)
631 blocks
[i
] = stripes
[(qdisk
+1+i
) % raid_disks
];
633 syndrome_disks
= data_disks
;
635 qsyndrome((uint8_t*)stripes
[disk
],
636 (uint8_t*)stripes
[qdisk
],
638 syndrome_disks
, chunk_size
);
641 for (i
=0; i
< raid_disks
; i
++)
643 if (lseek64(dest
[i
], offsets
[i
]+offset
, 0) < 0)
645 if (write(dest
[i
], stripes
[i
], chunk_size
) != chunk_size
)
656 int test_stripes(int *source
, unsigned long long *offsets
,
657 int raid_disks
, int chunk_size
, int level
, int layout
,
658 unsigned long long start
, unsigned long long length
)
660 /* ready the data and p (and q) blocks, and check we got them right */
661 char *stripe_buf
= malloc(raid_disks
* chunk_size
);
662 char **stripes
= malloc(raid_disks
* sizeof(char*));
663 char **blocks
= malloc(raid_disks
* sizeof(char*));
664 char *p
= malloc(chunk_size
);
665 char *q
= malloc(chunk_size
);
668 int data_disks
= raid_disks
- (level
== 5 ? 1: 2);
669 for ( i
= 0 ; i
< raid_disks
; i
++)
670 stripes
[i
] = stripe_buf
+ i
* chunk_size
;
675 for (i
= 0 ; i
< raid_disks
; i
++) {
676 lseek64(source
[i
], offsets
[i
]+start
, 0);
677 read(source
[i
], stripes
[i
], chunk_size
);
679 for (i
= 0 ; i
< data_disks
; i
++) {
680 int disk
= geo_map(i
, start
/chunk_size
, raid_disks
,
682 blocks
[i
] = stripes
[disk
];
683 printf("%d->%d\n", i
, disk
);
687 qsyndrome(p
, q
, (uint8_t**)blocks
, data_disks
, chunk_size
);
688 disk
= geo_map(-1, start
/chunk_size
, raid_disks
,
690 if (memcmp(p
, stripes
[disk
], chunk_size
) != 0) {
691 printf("P(%d) wrong at %llu\n", disk
,
694 disk
= geo_map(-2, start
/chunk_size
, raid_disks
,
696 if (memcmp(q
, stripes
[disk
], chunk_size
) != 0) {
697 printf("Q(%d) wrong at %llu\n", disk
,
702 length
-= chunk_size
;
708 unsigned long long getnum(char *str
, char **err
)
711 unsigned long long rv
= strtoull(str
, &e
, 10);
719 main(int argc
, char *argv
[])
721 /* save/restore file raid_disks chunk_size level layout start length devices...
728 unsigned long long *offsets
;
729 int raid_disks
, chunk_size
, level
, layout
;
730 unsigned long long start
, length
;
735 fprintf(stderr
, "Usage: test_stripe save/restore file raid_disks"
736 " chunk_size level layout start length devices...\n");
739 if (strcmp(argv
[1], "save")==0)
741 else if (strcmp(argv
[1], "restore") == 0)
743 else if (strcmp(argv
[1], "test") == 0)
746 fprintf(stderr
, "test_stripe: must give 'save' or 'restore'.\n");
751 raid_disks
= getnum(argv
[3], &err
);
752 chunk_size
= getnum(argv
[4], &err
);
753 level
= getnum(argv
[5], &err
);
754 layout
= getnum(argv
[6], &err
);
755 start
= getnum(argv
[7], &err
);
756 length
= getnum(argv
[8], &err
);
758 fprintf(stderr
, "test_stripe: Bad number: %s\n", err
);
761 if (argc
!= raid_disks
+ 9) {
762 fprintf(stderr
, "test_stripe: wrong number of devices: want %d found %d\n",
766 fds
= malloc(raid_disks
* sizeof(*fds
));
767 offsets
= malloc(raid_disks
* sizeof(*offsets
));
768 memset(offsets
, 0, raid_disks
* sizeof(*offsets
));
770 storefd
= open(file
, O_RDWR
);
773 fprintf(stderr
, "test_stripe: could not open %s.\n", file
);
776 for (i
=0; i
<raid_disks
; i
++) {
777 fds
[i
] = open(argv
[9+i
], O_RDWR
);
780 fprintf(stderr
,"test_stripe: cannot open %s.\n", argv
[9+i
]);
785 buf
= malloc(raid_disks
* chunk_size
);
788 int rv
= save_stripes(fds
, offsets
,
789 raid_disks
, chunk_size
, level
, layout
,
794 "test_stripe: save_stripes returned %d\n", rv
);
797 } else if (save
== 2) {
798 int rv
= test_stripes(fds
, offsets
,
799 raid_disks
, chunk_size
, level
, layout
,
803 "test_stripe: test_stripes returned %d\n", rv
);
807 int rv
= restore_stripes(fds
, offsets
,
808 raid_disks
, chunk_size
, level
, layout
,
813 "test_stripe: restore_stripes returned %d\n",