]> git.ipfire.org Git - thirdparty/mdadm.git/blame - restripe.c
raid6check: various cleanup/fixes
[thirdparty/mdadm.git] / restripe.c
CommitLineData
e86c9dd6
NB
1/*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
e736b623 4 * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
e86c9dd6
NB
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
23 */
24
25#include "mdadm.h"
a6288483 26#include <stdint.h>
e86c9dd6
NB
27
/* To restripe, we read from old geometry to a buffer, and
 * write from buffer to new geometry.
 * When reading, we might have missing devices and so could need
 * to reconstruct.
 * When writing, we need to create correct parity and Q.
 *
 */
35
979afcb8 36int geo_map(int block, unsigned long long stripe, int raid_disks,
e0d95aac 37 int level, int layout)
e86c9dd6 38{
48327135 39 /* On the given stripe, find which disk in the array will have
e86c9dd6 40 * block numbered 'block'.
48327135
NB
41 * '-1' means the parity block.
42 * '-2' means the Q syndrome.
e86c9dd6
NB
43 */
44 int pd;
45
b6e317c8
AK
46 /* layout is not relevant for raid0 and raid4 */
47 if ((level == 0) ||
48 (level == 4))
49 layout = 0;
50
e86c9dd6
NB
51 switch(level*100 + layout) {
52 case 000:
53 case 400:
e0d95aac 54 case 500 + ALGORITHM_PARITY_N:
e86c9dd6
NB
55 /* raid 4 isn't messed around by parity blocks */
56 if (block == -1)
57 return raid_disks-1; /* parity block */
58 return block;
59 case 500 + ALGORITHM_LEFT_ASYMMETRIC:
60 pd = (raid_disks-1) - stripe % raid_disks;
61 if (block == -1) return pd;
62 if (block >= pd)
63 block++;
64 return block;
65
66 case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
67 pd = stripe % raid_disks;
68 if (block == -1) return pd;
69 if (block >= pd)
70 block++;
71 return block;
72
73 case 500 + ALGORITHM_LEFT_SYMMETRIC:
74 pd = (raid_disks - 1) - stripe % raid_disks;
75 if (block == -1) return pd;
76 return (pd + 1 + block) % raid_disks;
77
78 case 500 + ALGORITHM_RIGHT_SYMMETRIC:
79 pd = stripe % raid_disks;
80 if (block == -1) return pd;
81 return (pd + 1 + block) % raid_disks;
82
e0d95aac
N
83 case 500 + ALGORITHM_PARITY_0:
84 return block + 1;
85
e0d95aac
N
86 case 600 + ALGORITHM_PARITY_N_6:
87 if (block == -2)
88 return raid_disks - 1;
89 if (block == -1)
90 return raid_disks - 2; /* parity block */
91 return block;
92 case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
93 if (block == -2)
94 return raid_disks - 1;
95 raid_disks--;
96 pd = (raid_disks-1) - stripe % raid_disks;
97 if (block == -1) return pd;
98 if (block >= pd)
99 block++;
100 return block;
101
102 case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
103 if (block == -2)
104 return raid_disks - 1;
105 raid_disks--;
106 pd = stripe % raid_disks;
107 if (block == -1) return pd;
108 if (block >= pd)
109 block++;
110 return block;
111
112 case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
113 if (block == -2)
114 return raid_disks - 1;
115 raid_disks--;
116 pd = (raid_disks - 1) - stripe % raid_disks;
117 if (block == -1) return pd;
118 return (pd + 1 + block) % raid_disks;
119
120 case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
121 if (block == -2)
122 return raid_disks - 1;
123 raid_disks--;
124 pd = stripe % raid_disks;
125 if (block == -1) return pd;
126 return (pd + 1 + block) % raid_disks;
127
128 case 600 + ALGORITHM_PARITY_0_6:
129 if (block == -2)
130 return raid_disks - 1;
131 return block + 1;
132
e0d95aac
N
133 case 600 + ALGORITHM_PARITY_0:
134 if (block == -1)
135 return 0;
136 if (block == -2)
137 return 1;
138 return block + 2;
139
e86c9dd6
NB
140 case 600 + ALGORITHM_LEFT_ASYMMETRIC:
141 pd = raid_disks - 1 - (stripe % raid_disks);
142 if (block == -1) return pd;
48327135 143 if (block == -2) return (pd+1) % raid_disks;
e86c9dd6
NB
144 if (pd == raid_disks - 1)
145 return block+1;
146 if (block >= pd)
147 return block+2;
148 return block;
149
e0d95aac
N
150 case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
151 /* Different order for calculating Q, otherwize same as ... */
e86c9dd6
NB
152 case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
153 pd = stripe % raid_disks;
154 if (block == -1) return pd;
48327135 155 if (block == -2) return (pd+1) % raid_disks;
e86c9dd6
NB
156 if (pd == raid_disks - 1)
157 return block+1;
158 if (block >= pd)
159 return block+2;
160 return block;
161
162 case 600 + ALGORITHM_LEFT_SYMMETRIC:
163 pd = raid_disks - 1 - (stripe % raid_disks);
164 if (block == -1) return pd;
48327135 165 if (block == -2) return (pd+1) % raid_disks;
e86c9dd6
NB
166 return (pd + 2 + block) % raid_disks;
167
168 case 600 + ALGORITHM_RIGHT_SYMMETRIC:
169 pd = stripe % raid_disks;
170 if (block == -1) return pd;
48327135 171 if (block == -2) return (pd+1) % raid_disks;
e86c9dd6 172 return (pd + 2 + block) % raid_disks;
e0d95aac 173
e0d95aac
N
174 case 600 + ALGORITHM_ROTATING_N_RESTART:
175 /* Same a left_asymmetric, by first stripe is
176 * D D D P Q rather than
177 * Q D D D P
178 */
179 pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
180 if (block == -1) return pd;
181 if (block == -2) return (pd+1) % raid_disks;
182 if (pd == raid_disks - 1)
183 return block+1;
184 if (block >= pd)
185 return block+2;
186 return block;
187
188 case 600 + ALGORITHM_ROTATING_N_CONTINUE:
189 /* Same as left_symmetric but Q is before P */
190 pd = raid_disks - 1 - (stripe % raid_disks);
191 if (block == -1) return pd;
192 if (block == -2) return (pd+raid_disks-1) % raid_disks;
193 return (pd + 1 + block) % raid_disks;
e86c9dd6
NB
194 }
195 return -1;
196}
ad1a3c2f
N
197
198int is_ddf(int layout)
e0d95aac
N
199{
200 switch (layout)
201 {
202 default:
203 return 0;
204 case ALGORITHM_ROTATING_N_CONTINUE:
205 case ALGORITHM_ROTATING_N_RESTART:
206 case ALGORITHM_ROTATING_ZERO_RESTART:
207 return 1;
208 }
209}
e86c9dd6 210
59679536 211void xor_blocks(char *target, char **sources, int disks, int size)
e86c9dd6
NB
212{
213 int i, j;
214 /* Amazingly inefficient... */
215 for (i=0; i<size; i++) {
216 char c = 0;
217 for (j=0 ; j<disks; j++)
218 c ^= sources[j][i];
219 target[i] = c;
220 }
221}
222
979afcb8 223void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
48327135
NB
224{
225 int d, z;
a6288483 226 uint8_t wq0, wp0, wd0, w10, w20;
48327135
NB
227 for ( d = 0; d < size; d++) {
228 wq0 = wp0 = sources[disks-1][d];
229 for ( z = disks-2 ; z >= 0 ; z-- ) {
230 wd0 = sources[z][d];
231 wp0 ^= wd0;
232 w20 = (wq0&0x80) ? 0xff : 0x00;
233 w10 = (wq0 << 1) & 0xff;
234 w20 &= 0x1d;
235 w10 ^= w20;
236 wq0 = w10 ^ wd0;
237 }
238 p[d] = wp0;
239 q[d] = wq0;
240 }
241}
242
a6288483
N
243/*
244 * The following was taken from linux/drivers/md/mktables.c, and modified
245 * to create in-memory tables rather than C code
246 */
247static uint8_t gfmul(uint8_t a, uint8_t b)
248{
249 uint8_t v = 0;
250
251 while (b) {
252 if (b & 1)
253 v ^= a;
254 a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
255 b >>= 1;
256 }
257
258 return v;
259}
260
261static uint8_t gfpow(uint8_t a, int b)
262{
263 uint8_t v = 1;
264
265 b %= 255;
266 if (b < 0)
267 b += 255;
268
269 while (b) {
270 if (b & 1)
271 v = gfmul(v, a);
272 a = gfmul(a, a);
273 b >>= 1;
274 }
275
276 return v;
277}
278
279int tables_ready = 0;
280uint8_t raid6_gfmul[256][256];
281uint8_t raid6_gfexp[256];
282uint8_t raid6_gfinv[256];
283uint8_t raid6_gfexi[256];
9d0e7840
PS
284uint8_t raid6_gflog[256];
285uint8_t raid6_gfilog[256];
a6288483
N
286void make_tables(void)
287{
288 int i, j;
289 uint8_t v;
9d0e7840 290 uint32_t b, log;
a6288483
N
291
292 /* Compute multiplication table */
293 for (i = 0; i < 256; i++)
294 for (j = 0; j < 256; j++)
295 raid6_gfmul[i][j] = gfmul(i, j);
296
297 /* Compute power-of-2 table (exponent) */
298 v = 1;
299 for (i = 0; i < 256; i++) {
300 raid6_gfexp[i] = v;
301 v = gfmul(v, 2);
302 if (v == 1)
303 v = 0; /* For entry 255, not a real entry */
304 }
305
306 /* Compute inverse table x^-1 == x^254 */
307 for (i = 0; i < 256; i++)
308 raid6_gfinv[i] = gfpow(i, 254);
309
310 /* Compute inv(2^x + 1) (exponent-xor-inverse) table */
311 for (i = 0; i < 256; i ++)
312 raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1];
313
9d0e7840
PS
314 /* Compute log and inverse log */
315 /* Modified code from:
316 * http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
317 */
318 b = 1;
319 raid6_gflog[0] = 0;
320 raid6_gfilog[255] = 0;
321
322 for (log = 0; log < 255; log++) {
323 raid6_gflog[b] = (uint8_t) log;
324 raid6_gfilog[log] = (uint8_t) b;
325 b = b << 1;
326 if (b & 256) b = b ^ 0435;
327 }
328
a6288483
N
329 tables_ready = 1;
330}
331
332uint8_t *zero;
d47a2925 333int zero_size;
59679536
RB
334
335void ensure_zero_has_size(int chunk_size)
336{
337 if (zero == NULL || chunk_size > zero_size) {
338 if (zero)
339 free(zero);
340 zero = xcalloc(1, chunk_size);
341 zero_size = chunk_size;
342 }
343}
344
a6288483
N
345/* Following was taken from linux/drivers/md/raid6recov.c */
346
347/* Recover two failed data blocks. */
50786d47 348
a6288483 349void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
50786d47 350 uint8_t **ptrs, int neg_offset)
a6288483
N
351{
352 uint8_t *p, *q, *dp, *dq;
353 uint8_t px, qx, db;
354 const uint8_t *pbmul; /* P multiplier table for B data */
355 const uint8_t *qmul; /* Q multiplier table (for both) */
356
50786d47
N
357 if (neg_offset) {
358 p = ptrs[-1];
359 q = ptrs[-2];
360 } else {
361 p = ptrs[disks-2];
362 q = ptrs[disks-1];
363 }
a6288483
N
364
365 /* Compute syndrome with zero for the missing data pages
366 Use the dead data pages as temporary storage for
367 delta p and delta q */
368 dp = ptrs[faila];
369 ptrs[faila] = zero;
370 dq = ptrs[failb];
371 ptrs[failb] = zero;
372
373 qsyndrome(dp, dq, ptrs, disks-2, bytes);
374
375 /* Restore pointer table */
376 ptrs[faila] = dp;
377 ptrs[failb] = dq;
378
379 /* Now, pick the proper data tables */
380 pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
381 qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
382
383 /* Now do it... */
384 while ( bytes-- ) {
385 px = *p ^ *dp;
386 qx = qmul[*q ^ *dq];
387 *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
388 *dp++ = db ^ px; /* Reconstructed A */
389 p++; q++;
390 }
391}
392
393/* Recover failure of one data block plus the P block */
50786d47
N
394void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
395 int neg_offset)
a6288483
N
396{
397 uint8_t *p, *q, *dq;
398 const uint8_t *qmul; /* Q multiplier table */
399
50786d47
N
400 if (neg_offset) {
401 p = ptrs[-1];
402 q = ptrs[-2];
403 } else {
404 p = ptrs[disks-2];
405 q = ptrs[disks-1];
406 }
a6288483
N
407
408 /* Compute syndrome with zero for the missing data page
409 Use the dead data page as temporary storage for delta q */
410 dq = ptrs[faila];
411 ptrs[faila] = zero;
412
413 qsyndrome(p, dq, ptrs, disks-2, bytes);
414
415 /* Restore pointer table */
416 ptrs[faila] = dq;
417
418 /* Now, pick the proper data tables */
419 qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
420
421 /* Now do it... */
422 while ( bytes-- ) {
423 *p++ ^= *dq = qmul[*q ^ *dq];
424 q++; dq++;
425 }
426}
427
9d0e7840
PS
428/* Try to find out if a specific disk has a problem */
429int raid6_check_disks(int data_disks, int start, int chunk_size,
430 int level, int layout, int diskP, int diskQ,
431 char *p, char *q, char **stripes)
432{
433 int i;
434 int data_id, diskD;
435 uint8_t Px, Qx;
436 int curr_broken_disk = -1;
437 int prev_broken_disk = -1;
438 int broken_status = 0;
439
440 for(i = 0; i < chunk_size; i++) {
441 Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i];
442 Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i];
443
444 if((Px != 0) && (Qx == 0))
445 curr_broken_disk = diskP;
446
9d0e7840
PS
447 if((Px == 0) && (Qx != 0))
448 curr_broken_disk = diskQ;
449
9d0e7840 450 if((Px != 0) && (Qx != 0)) {
c4db5301
PS
451 data_id = (raid6_gflog[Qx] - raid6_gflog[Px]);
452 if(data_id < 0) data_id += 255;
9d0e7840
PS
453 diskD = geo_map(data_id, start/chunk_size,
454 data_disks + 2, level, layout);
455 curr_broken_disk = diskD;
456 }
457
458 if((Px == 0) && (Qx == 0))
459 curr_broken_disk = curr_broken_disk;
460
c4db5301
PS
461 if(curr_broken_disk >= data_disks + 2)
462 broken_status = 2;
463
9d0e7840
PS
464 switch(broken_status) {
465 case 0:
466 if(curr_broken_disk != -1) {
467 prev_broken_disk = curr_broken_disk;
468 broken_status = 1;
469 }
470 break;
471
472 case 1:
473 if(curr_broken_disk != prev_broken_disk)
474 broken_status = 2;
9d0e7840
PS
475 break;
476
477 case 2:
478 default:
479 curr_broken_disk = prev_broken_disk = -2;
480 break;
481 }
482 }
483
484 return curr_broken_disk;
485}
486
2fcb75ae
AK
487/*******************************************************************************
488 * Function: save_stripes
489 * Description:
490 * Function reads data (only data without P and Q) from array and writes
491 * it to buf and opcjonaly to backup files
492 * Parameters:
493 * source : A list of 'fds' of the active disks.
494 * Some may be absent
495 * offsets : A list of offsets on disk belonging
496 * to the array [bytes]
497 * raid_disks : geometry: number of disks in the array
498 * chunk_size : geometry: chunk size [bytes]
499 * level : geometry: RAID level
500 * layout : geometry: layout
501 * nwrites : number of backup files
502 * dest : A list of 'fds' for mirrored targets
503 * (e.g. backup files). They are already seeked to right
504 * (write) location. If NULL, data will be wrote
505 * to the buf only
506 * start : start address of data to read (must be stripe-aligned)
507 * [bytes]
508 * length - : length of data to read (must be stripe-aligned)
509 * [bytes]
510 * buf : buffer for data. It is large enough to hold
511 * one stripe. It is stripe aligned
512 * Returns:
513 * 0 : success
514 * -1 : fail
515 ******************************************************************************/
e86c9dd6
NB
516int save_stripes(int *source, unsigned long long *offsets,
517 int raid_disks, int chunk_size, int level, int layout,
518 int nwrites, int *dest,
a6288483
N
519 unsigned long long start, unsigned long long length,
520 char *buf)
e86c9dd6 521{
e86c9dd6
NB
522 int len;
523 int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
524 int disk;
a6288483 525 int i;
2fcb75ae 526 unsigned long long length_test;
e86c9dd6 527
a6288483
N
528 if (!tables_ready)
529 make_tables();
59679536 530 ensure_zero_has_size(chunk_size);
a6288483
N
531
532 len = data_disks * chunk_size;
2fcb75ae
AK
533 length_test = length / len;
534 length_test *= len;
535
536 if (length != length_test) {
537 dprintf("Error: save_stripes(): Data are not alligned. EXIT\n");
538 dprintf("\tArea for saving stripes (length) = %llu\n", length);
539 dprintf("\tWork step (len) = %i\n", len);
540 dprintf("\tExpected save area (length_test) = %llu\n",
541 length_test);
542 abort();
543 }
544
e86c9dd6 545 while (length > 0) {
a6288483
N
546 int failed = 0;
547 int fdisk[3], fblock[3];
548 for (disk = 0; disk < raid_disks ; disk++) {
549 unsigned long long offset;
550 int dnum;
a6288483
N
551
552 offset = (start/chunk_size/data_disks)*chunk_size;
553 dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1,
554 start/chunk_size/data_disks,
555 raid_disks, level, layout);
7236ee7a 556 if (dnum < 0) abort();
a6288483 557 if (source[dnum] < 0 ||
cc50ccdc 558 lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 ||
7236ee7a
N
559 read(source[dnum], buf+disk * chunk_size, chunk_size)
560 != chunk_size)
a6288483
N
561 if (failed <= 2) {
562 fdisk[failed] = dnum;
563 fblock[failed] = disk;
564 failed++;
565 }
566 }
567 if (failed == 0 || fblock[0] >= data_disks)
568 /* all data disks are good */
569 ;
570 else if (failed == 1 || fblock[1] >= data_disks+1) {
571 /* one failed data disk and good parity */
572 char *bufs[data_disks];
573 for (i=0; i < data_disks; i++)
574 if (fblock[0] == i)
575 bufs[i] = buf + data_disks*chunk_size;
576 else
577 bufs[i] = buf + i*chunk_size;
578
579 xor_blocks(buf + fblock[0]*chunk_size,
580 bufs, data_disks, chunk_size);
581 } else if (failed > 2 || level != 6)
582 /* too much failure */
e86c9dd6 583 return -1;
a6288483
N
584 else {
585 /* RAID6 computations needed. */
586 uint8_t *bufs[data_disks+4];
587 int qdisk;
588 int syndrome_disks;
589 disk = geo_map(-1, start/chunk_size/data_disks,
590 raid_disks, level, layout);
591 qdisk = geo_map(-2, start/chunk_size/data_disks,
592 raid_disks, level, layout);
593 if (is_ddf(layout)) {
594 /* q over 'raid_disks' blocks, in device order.
595 * 'p' and 'q' get to be all zero
596 */
597 for (i = 0; i < raid_disks; i++)
cc50ccdc
N
598 bufs[i] = zero;
599 for (i = 0; i < data_disks; i++) {
600 int dnum = geo_map(i,
601 start/chunk_size/data_disks,
602 raid_disks, level, layout);
603 int snum;
604 /* i is the logical block number, so is index to 'buf'.
605 * dnum is physical disk number
606 * and thus the syndrome number.
607 */
608 snum = dnum;
609 bufs[snum] = (uint8_t*)buf + chunk_size * i;
610 }
a6288483
N
611 syndrome_disks = raid_disks;
612 } else {
613 /* for md, q is over 'data_disks' blocks,
614 * starting immediately after 'q'
1eac9f84
N
615 * Note that for the '_6' variety, the p block
616 * makes a hole that we need to be careful of.
a6288483 617 */
1eac9f84
N
618 int j;
619 int snum = 0;
620 for (j = 0; j < raid_disks; j++) {
621 int dnum = (qdisk + 1 + j) % raid_disks;
622 if (dnum == disk || dnum == qdisk)
623 continue;
624 for (i = 0; i < data_disks; i++)
625 if (geo_map(i,
626 start/chunk_size/data_disks,
627 raid_disks, level, layout) == dnum)
628 break;
cc50ccdc
N
629 /* i is the logical block number, so is index to 'buf'.
630 * dnum is physical disk number
631 * snum is syndrome disk for which 0 is immediately after Q
632 */
cc50ccdc 633 bufs[snum] = (uint8_t*)buf + chunk_size * i;
1eac9f84
N
634
635 if (fblock[0] == i)
636 fdisk[0] = snum;
637 if (fblock[1] == i)
638 fdisk[1] = snum;
639 snum++;
cc50ccdc 640 }
a6288483 641
a6288483
N
642 syndrome_disks = data_disks;
643 }
cc50ccdc
N
644
645 /* Place P and Q blocks at end of bufs */
646 bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks;
647 bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1);
648
a6288483
N
649 if (fblock[1] == data_disks)
650 /* One data failed, and parity failed */
651 raid6_datap_recov(syndrome_disks+2, chunk_size,
50786d47 652 fdisk[0], bufs, 0);
cc50ccdc
N
653 else {
654 if (fdisk[0] > fdisk[1]) {
655 int t = fdisk[0];
656 fdisk[0] = fdisk[1];
657 fdisk[1] = t;
658 }
a6288483
N
659 /* Two data blocks failed, P,Q OK */
660 raid6_2data_recov(syndrome_disks+2, chunk_size,
50786d47 661 fdisk[0], fdisk[1], bufs, 0);
cc50ccdc 662 }
a6288483 663 }
ccced3dc 664 if (dest) {
2fcb75ae
AK
665 for (i = 0; i < nwrites; i++)
666 if (write(dest[i], buf, len) != len)
667 return -1;
ccced3dc
AK
668 } else {
669 /* build next stripe in buffer */
670 buf += len;
671 }
e86c9dd6
NB
672 length -= len;
673 start += len;
e86c9dd6
NB
674 }
675 return 0;
676}
677
678/* Restore data:
679 * We are given:
680 * A list of 'fds' of the active disks. Some may be '-1' for not-available.
353632d9 681 * A geometry: raid_disks, chunk_size, level, layout
e86c9dd6
NB
682 * An 'fd' to read from. It is already seeked to the right (Read) location.
683 * A start and length.
684 * The length must be a multiple of the stripe size.
685 *
686 * We build a full stripe in memory and then write it out.
687 * We assume that there are enough working devices.
688 */
689int restore_stripes(int *dest, unsigned long long *offsets,
690 int raid_disks, int chunk_size, int level, int layout,
353632d9 691 int source, unsigned long long read_offset,
2fcb75ae
AK
692 unsigned long long start, unsigned long long length,
693 char *src_buf)
e86c9dd6 694{
e9e43ec3 695 char *stripe_buf;
503975b9
N
696 char **stripes = xmalloc(raid_disks * sizeof(char*));
697 char **blocks = xmalloc(raid_disks * sizeof(char*));
e86c9dd6 698 int i;
758be4f1 699 int rv;
e86c9dd6 700
a6288483 701 int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
e86c9dd6 702
fcf57625
N
703 if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size))
704 stripe_buf = NULL;
d47a2925
N
705
706 if (zero == NULL || chunk_size > zero_size) {
707 if (zero)
708 free(zero);
503975b9 709 zero = xcalloc(1, chunk_size);
d47a2925 710 zero_size = chunk_size;
a6288483 711 }
d47a2925 712
e0d95aac
N
713 if (stripe_buf == NULL || stripes == NULL || blocks == NULL
714 || zero == NULL) {
758be4f1
LD
715 rv = -2;
716 goto abort;
e86c9dd6 717 }
2fcb75ae 718 for (i = 0; i < raid_disks; i++)
e86c9dd6
NB
719 stripes[i] = stripe_buf + i * chunk_size;
720 while (length > 0) {
f21e18ca 721 unsigned int len = data_disks * chunk_size;
e86c9dd6 722 unsigned long long offset;
48327135 723 int disk, qdisk;
a6288483 724 int syndrome_disks;
758be4f1
LD
725 if (length < len) {
726 rv = -3;
727 goto abort;
728 }
2fcb75ae 729 for (i = 0; i < data_disks; i++) {
e86c9dd6
NB
730 int disk = geo_map(i, start/chunk_size/data_disks,
731 raid_disks, level, layout);
2fcb75ae
AK
732 if (src_buf == NULL) {
733 /* read from file */
758be4f1
LD
734 if (lseek64(source, read_offset, 0) !=
735 (off64_t)read_offset) {
736 rv = -1;
737 goto abort;
738 }
2fcb75ae
AK
739 if (read(source,
740 stripes[disk],
758be4f1
LD
741 chunk_size) != chunk_size) {
742 rv = -1;
743 goto abort;
744 }
2fcb75ae
AK
745 } else {
746 /* read from input buffer */
747 memcpy(stripes[disk],
748 src_buf + read_offset,
749 chunk_size);
750 }
353632d9 751 read_offset += chunk_size;
e86c9dd6
NB
752 }
753 /* We have the data, now do the parity */
754 offset = (start/chunk_size/data_disks) * chunk_size;
48327135
NB
755 switch (level) {
756 case 4:
757 case 5:
758 disk = geo_map(-1, start/chunk_size/data_disks,
e86c9dd6 759 raid_disks, level, layout);
e0d95aac
N
760 for (i = 0; i < data_disks; i++)
761 blocks[i] = stripes[(disk+1+i) % raid_disks];
e86c9dd6 762 xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
48327135
NB
763 break;
764 case 6:
765 disk = geo_map(-1, start/chunk_size/data_disks,
766 raid_disks, level, layout);
767 qdisk = geo_map(-2, start/chunk_size/data_disks,
768 raid_disks, level, layout);
e0d95aac
N
769 if (is_ddf(layout)) {
770 /* q over 'raid_disks' blocks, in device order.
771 * 'p' and 'q' get to be all zero
772 */
773 for (i = 0; i < raid_disks; i++)
774 if (i == disk || i == qdisk)
a6288483 775 blocks[i] = (char*)zero;
e0d95aac
N
776 else
777 blocks[i] = stripes[i];
a6288483 778 syndrome_disks = raid_disks;
e0d95aac 779 } else {
a6288483 780 /* for md, q is over 'data_disks' blocks,
e0d95aac
N
781 * starting immediately after 'q'
782 */
783 for (i = 0; i < data_disks; i++)
784 blocks[i] = stripes[(qdisk+1+i) % raid_disks];
48327135 785
a6288483 786 syndrome_disks = data_disks;
e0d95aac 787 }
a6288483 788 qsyndrome((uint8_t*)stripes[disk],
1011e834 789 (uint8_t*)stripes[qdisk],
a6288483
N
790 (uint8_t**)blocks,
791 syndrome_disks, chunk_size);
48327135 792 break;
e86c9dd6
NB
793 }
794 for (i=0; i < raid_disks ; i++)
795 if (dest[i] >= 0) {
758be4f1
LD
796 if (lseek64(dest[i],
797 offsets[i]+offset, 0) < 0) {
798 rv = -1;
799 goto abort;
800 }
801 if (write(dest[i], stripes[i],
802 chunk_size) != chunk_size) {
803 rv = -1;
804 goto abort;
805 }
e86c9dd6
NB
806 }
807 length -= len;
808 start += len;
809 }
758be4f1
LD
810 rv = 0;
811
812abort:
813 free(stripe_buf);
814 free(stripes);
815 free(blocks);
816 return rv;
e86c9dd6
NB
817}
818
819#ifdef MAIN
820
48327135
NB
821int test_stripes(int *source, unsigned long long *offsets,
822 int raid_disks, int chunk_size, int level, int layout,
823 unsigned long long start, unsigned long long length)
824{
825 /* ready the data and p (and q) blocks, and check we got them right */
503975b9
N
826 char *stripe_buf = xmalloc(raid_disks * chunk_size);
827 char **stripes = xmalloc(raid_disks * sizeof(char*));
828 char **blocks = xmalloc(raid_disks * sizeof(char*));
829 char *p = xmalloc(chunk_size);
830 char *q = xmalloc(chunk_size);
48327135
NB
831
832 int i;
9d0e7840 833 int diskP, diskQ;
48327135 834 int data_disks = raid_disks - (level == 5 ? 1: 2);
9d0e7840
PS
835
836 if (!tables_ready)
837 make_tables();
838
48327135
NB
839 for ( i = 0 ; i < raid_disks ; i++)
840 stripes[i] = stripe_buf + i * chunk_size;
841
842 while (length > 0) {
843 int disk;
844
845 for (i = 0 ; i < raid_disks ; i++) {
846 lseek64(source[i], offsets[i]+start, 0);
847 read(source[i], stripes[i], chunk_size);
848 }
849 for (i = 0 ; i < data_disks ; i++) {
850 int disk = geo_map(i, start/chunk_size, raid_disks,
851 level, layout);
852 blocks[i] = stripes[disk];
853 printf("%d->%d\n", i, disk);
854 }
855 switch(level) {
856 case 6:
521f349c 857 qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
9d0e7840 858 diskP = geo_map(-1, start/chunk_size, raid_disks,
48327135 859 level, layout);
9d0e7840
PS
860 if (memcmp(p, stripes[diskP], chunk_size) != 0) {
861 printf("P(%d) wrong at %llu\n", diskP,
48327135
NB
862 start / chunk_size);
863 }
9d0e7840 864 diskQ = geo_map(-2, start/chunk_size, raid_disks,
48327135 865 level, layout);
9d0e7840
PS
866 if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
867 printf("Q(%d) wrong at %llu\n", diskQ,
48327135
NB
868 start / chunk_size);
869 }
9d0e7840
PS
870 disk = raid6_check_disks(data_disks, start, chunk_size,
871 level, layout, diskP, diskQ,
872 p, q, stripes);
873 if(disk >= 0) {
874 printf("Possible failed disk: %d\n", disk);
875 }
876 if(disk == -2) {
877 printf("Failure detected, but disk unknown\n");
878 }
48327135
NB
879 break;
880 }
881 length -= chunk_size;
882 start += chunk_size;
883 }
884 return 0;
885}
886
e86c9dd6
NB
887unsigned long long getnum(char *str, char **err)
888{
889 char *e;
890 unsigned long long rv = strtoull(str, &e, 10);
891 if (e==str || *e) {
892 *err = str;
893 return 0;
894 }
895 return rv;
896}
897
42129b3f 898char const Name[] = "test_restripe";
ad1a3c2f 899int main(int argc, char *argv[])
e86c9dd6
NB
900{
901 /* save/restore file raid_disks chunk_size level layout start length devices...
902 */
903 int save;
904 int *fds;
905 char *file;
a6288483 906 char *buf;
e86c9dd6
NB
907 int storefd;
908 unsigned long long *offsets;
909 int raid_disks, chunk_size, level, layout;
910 unsigned long long start, length;
911 int i;
912
913 char *err = NULL;
914 if (argc < 10) {
7a862a02 915 fprintf(stderr, "Usage: test_stripe save/restore file raid_disks chunk_size level layout start length devices...\n");
e86c9dd6
NB
916 exit(1);
917 }
918 if (strcmp(argv[1], "save")==0)
919 save = 1;
920 else if (strcmp(argv[1], "restore") == 0)
921 save = 0;
48327135
NB
922 else if (strcmp(argv[1], "test") == 0)
923 save = 2;
e86c9dd6
NB
924 else {
925 fprintf(stderr, "test_stripe: must give 'save' or 'restore'.\n");
926 exit(2);
927 }
928
929 file = argv[2];
930 raid_disks = getnum(argv[3], &err);
931 chunk_size = getnum(argv[4], &err);
932 level = getnum(argv[5], &err);
933 layout = getnum(argv[6], &err);
934 start = getnum(argv[7], &err);
935 length = getnum(argv[8], &err);
936 if (err) {
937 fprintf(stderr, "test_stripe: Bad number: %s\n", err);
938 exit(2);
939 }
940 if (argc != raid_disks + 9) {
941 fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
942 raid_disks, argc-9);
943 exit(2);
944 }
503975b9
N
945 fds = xmalloc(raid_disks * sizeof(*fds));
946 offsets = xcalloc(raid_disks, sizeof(*offsets));
e86c9dd6
NB
947
948 storefd = open(file, O_RDWR);
949 if (storefd < 0) {
950 perror(file);
951 fprintf(stderr, "test_stripe: could not open %s.\n", file);
952 exit(3);
953 }
954 for (i=0; i<raid_disks; i++) {
6f38d7ae
PS
955 char *p;
956 p = strchr(argv[9+i], ':');
957
958 if(p != NULL) {
959 *p++ = '\0';
960 offsets[i] = atoll(p) * 512;
961 }
1011e834 962
e86c9dd6
NB
963 fds[i] = open(argv[9+i], O_RDWR);
964 if (fds[i] < 0) {
965 perror(argv[9+i]);
966 fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
967 exit(3);
968 }
969 }
970
503975b9 971 buf = xmalloc(raid_disks * chunk_size);
a6288483 972
48327135 973 if (save == 1) {
e86c9dd6
NB
974 int rv = save_stripes(fds, offsets,
975 raid_disks, chunk_size, level, layout,
976 1, &storefd,
a6288483 977 start, length, buf);
e86c9dd6 978 if (rv != 0) {
48327135
NB
979 fprintf(stderr,
980 "test_stripe: save_stripes returned %d\n", rv);
981 exit(1);
982 }
983 } else if (save == 2) {
984 int rv = test_stripes(fds, offsets,
985 raid_disks, chunk_size, level, layout,
986 start, length);
987 if (rv != 0) {
988 fprintf(stderr,
989 "test_stripe: test_stripes returned %d\n", rv);
e86c9dd6
NB
990 exit(1);
991 }
992 } else {
993 int rv = restore_stripes(fds, offsets,
994 raid_disks, chunk_size, level, layout,
353632d9 995 storefd, 0ULL,
c071a1cd 996 start, length, NULL);
e86c9dd6 997 if (rv != 0) {
48327135
NB
998 fprintf(stderr,
999 "test_stripe: restore_stripes returned %d\n",
1000 rv);
e86c9dd6
NB
1001 exit(1);
1002 }
1003 }
1004 exit(0);
1005}
1006
1007#endif /* MAIN */