]> git.ipfire.org Git - thirdparty/mdadm.git/blame - restripe.c
mdadm: improve the dlm locking mechanism for clustered raid
[thirdparty/mdadm.git] / restripe.c
CommitLineData
e86c9dd6
NB
1/*
2 * mdadm - manage Linux "md" devices aka RAID arrays.
3 *
e736b623 4 * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
e86c9dd6
NB
5 *
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Author: Neil Brown
22 * Email: <neilb@suse.de>
23 */
24
25#include "mdadm.h"
a6288483 26#include <stdint.h>
e86c9dd6
NB
27
/* To restripe, we read from old geometry to a buffer, and
 * write from buffer to new geometry.
 * When reading, we might have missing devices and so could need
 * to reconstruct.
 * When writing, we need to create correct parity and Q.
 *
 */
35
979afcb8 36int geo_map(int block, unsigned long long stripe, int raid_disks,
e0d95aac 37 int level, int layout)
e86c9dd6 38{
48327135 39 /* On the given stripe, find which disk in the array will have
e86c9dd6 40 * block numbered 'block'.
48327135
NB
41 * '-1' means the parity block.
42 * '-2' means the Q syndrome.
e86c9dd6
NB
43 */
44 int pd;
45
b6e317c8
AK
46 /* layout is not relevant for raid0 and raid4 */
47 if ((level == 0) ||
48 (level == 4))
49 layout = 0;
50
e86c9dd6
NB
51 switch(level*100 + layout) {
52 case 000:
53 case 400:
e0d95aac 54 case 500 + ALGORITHM_PARITY_N:
e86c9dd6
NB
55 /* raid 4 isn't messed around by parity blocks */
56 if (block == -1)
57 return raid_disks-1; /* parity block */
58 return block;
59 case 500 + ALGORITHM_LEFT_ASYMMETRIC:
60 pd = (raid_disks-1) - stripe % raid_disks;
f1bbb5ff
JS
61 if (block == -1)
62 return pd;
e86c9dd6
NB
63 if (block >= pd)
64 block++;
65 return block;
66
67 case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
68 pd = stripe % raid_disks;
f1bbb5ff
JS
69 if (block == -1)
70 return pd;
e86c9dd6
NB
71 if (block >= pd)
72 block++;
73 return block;
74
75 case 500 + ALGORITHM_LEFT_SYMMETRIC:
76 pd = (raid_disks - 1) - stripe % raid_disks;
f1bbb5ff
JS
77 if (block == -1)
78 return pd;
e86c9dd6
NB
79 return (pd + 1 + block) % raid_disks;
80
81 case 500 + ALGORITHM_RIGHT_SYMMETRIC:
82 pd = stripe % raid_disks;
f1bbb5ff
JS
83 if (block == -1)
84 return pd;
e86c9dd6
NB
85 return (pd + 1 + block) % raid_disks;
86
e0d95aac
N
87 case 500 + ALGORITHM_PARITY_0:
88 return block + 1;
89
e0d95aac
N
90 case 600 + ALGORITHM_PARITY_N_6:
91 if (block == -2)
92 return raid_disks - 1;
93 if (block == -1)
94 return raid_disks - 2; /* parity block */
95 return block;
96 case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
97 if (block == -2)
98 return raid_disks - 1;
99 raid_disks--;
100 pd = (raid_disks-1) - stripe % raid_disks;
f1bbb5ff
JS
101 if (block == -1)
102 return pd;
e0d95aac
N
103 if (block >= pd)
104 block++;
105 return block;
106
107 case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
108 if (block == -2)
109 return raid_disks - 1;
110 raid_disks--;
111 pd = stripe % raid_disks;
f1bbb5ff
JS
112 if (block == -1)
113 return pd;
e0d95aac
N
114 if (block >= pd)
115 block++;
116 return block;
117
118 case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
119 if (block == -2)
120 return raid_disks - 1;
121 raid_disks--;
122 pd = (raid_disks - 1) - stripe % raid_disks;
f1bbb5ff
JS
123 if (block == -1)
124 return pd;
e0d95aac
N
125 return (pd + 1 + block) % raid_disks;
126
127 case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
128 if (block == -2)
129 return raid_disks - 1;
130 raid_disks--;
131 pd = stripe % raid_disks;
f1bbb5ff
JS
132 if (block == -1)
133 return pd;
e0d95aac
N
134 return (pd + 1 + block) % raid_disks;
135
136 case 600 + ALGORITHM_PARITY_0_6:
137 if (block == -2)
138 return raid_disks - 1;
139 return block + 1;
140
e0d95aac
N
141 case 600 + ALGORITHM_PARITY_0:
142 if (block == -1)
143 return 0;
144 if (block == -2)
145 return 1;
146 return block + 2;
147
e86c9dd6
NB
148 case 600 + ALGORITHM_LEFT_ASYMMETRIC:
149 pd = raid_disks - 1 - (stripe % raid_disks);
f1bbb5ff
JS
150 if (block == -1)
151 return pd;
152 if (block == -2)
153 return (pd+1) % raid_disks;
e86c9dd6
NB
154 if (pd == raid_disks - 1)
155 return block+1;
156 if (block >= pd)
157 return block+2;
158 return block;
159
e0d95aac
N
160 case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
161 /* Different order for calculating Q, otherwize same as ... */
e86c9dd6
NB
162 case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
163 pd = stripe % raid_disks;
f1bbb5ff
JS
164 if (block == -1)
165 return pd;
166 if (block == -2)
167 return (pd+1) % raid_disks;
e86c9dd6
NB
168 if (pd == raid_disks - 1)
169 return block+1;
170 if (block >= pd)
171 return block+2;
172 return block;
173
174 case 600 + ALGORITHM_LEFT_SYMMETRIC:
175 pd = raid_disks - 1 - (stripe % raid_disks);
f1bbb5ff
JS
176 if (block == -1)
177 return pd;
178 if (block == -2)
179 return (pd+1) % raid_disks;
e86c9dd6
NB
180 return (pd + 2 + block) % raid_disks;
181
182 case 600 + ALGORITHM_RIGHT_SYMMETRIC:
183 pd = stripe % raid_disks;
f1bbb5ff
JS
184 if (block == -1)
185 return pd;
186 if (block == -2)
187 return (pd+1) % raid_disks;
e86c9dd6 188 return (pd + 2 + block) % raid_disks;
e0d95aac 189
e0d95aac
N
190 case 600 + ALGORITHM_ROTATING_N_RESTART:
191 /* Same a left_asymmetric, by first stripe is
192 * D D D P Q rather than
193 * Q D D D P
194 */
195 pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
f1bbb5ff
JS
196 if (block == -1)
197 return pd;
198 if (block == -2)
199 return (pd+1) % raid_disks;
e0d95aac
N
200 if (pd == raid_disks - 1)
201 return block+1;
202 if (block >= pd)
203 return block+2;
204 return block;
205
206 case 600 + ALGORITHM_ROTATING_N_CONTINUE:
207 /* Same as left_symmetric but Q is before P */
208 pd = raid_disks - 1 - (stripe % raid_disks);
f1bbb5ff
JS
209 if (block == -1)
210 return pd;
211 if (block == -2)
212 return (pd+raid_disks-1) % raid_disks;
e0d95aac 213 return (pd + 1 + block) % raid_disks;
e86c9dd6
NB
214 }
215 return -1;
216}
ad1a3c2f
N
217
218int is_ddf(int layout)
e0d95aac
N
219{
220 switch (layout)
221 {
222 default:
223 return 0;
224 case ALGORITHM_ROTATING_N_CONTINUE:
225 case ALGORITHM_ROTATING_N_RESTART:
226 case ALGORITHM_ROTATING_ZERO_RESTART:
227 return 1;
228 }
229}
e86c9dd6 230
/*
 * XOR 'disks' source blocks together, byte by byte, into 'target'.
 *
 * target  : receives 'size' bytes
 * sources : 'disks' pointers, each to at least 'size' bytes
 *
 * With disks == 0 the target is filled with zeros.  Each output byte is
 * stored only after all sources have been read for that position.
 */
void xor_blocks(char *target, char **sources, int disks, int size)
{
	int i, j;
	/* Amazingly inefficient... */
	for (i=0; i<size; i++) {
		char c = 0;
		for (j=0 ; j<disks; j++)
			c ^= sources[j][i];
		target[i] = c;
	}
}
242
/*
 * Compute the P (xor) and Q (GF(2^8)) blocks over 'disks' source blocks.
 *
 * p, q    : outputs, 'size' bytes each
 * sources : 'disks' pointers to 'size' bytes each; sources[disks-1] is
 *           read first, so 'disks' must be at least 1
 *
 * For every byte position Q is accumulated from the highest source
 * down to source 0: multiply the running value by 2 in GF(2^8)
 * (reduction constant 0x1d), then xor in the next source byte.
 */
void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
{
	int d, z;
	uint8_t wq0, wp0, wd0, w10, w20;
	for ( d = 0; d < size; d++) {
		wq0 = wp0 = sources[disks-1][d];
		for ( z = disks-2 ; z >= 0 ; z-- ) {
			wd0 = sources[z][d];
			wp0 ^= wd0;
			/* wq0 *= 2 in GF(2^8): shift, reduce if the top bit was set */
			w20 = (wq0&0x80) ? 0xff : 0x00;
			w10 = (wq0 << 1) & 0xff;
			w20 &= 0x1d;
			w10 ^= w20;
			wq0 = w10 ^ wd0;
		}
		p[d] = wp0;
		q[d] = wq0;
	}
}
262
a6288483
N
/*
 * The following was taken from linux/drivers/md/mktables.c, and modified
 * to create in-memory tables rather than C code
 */

/*
 * Multiply a and b in GF(2^8), shift-and-add style: for each set bit of
 * b, xor in the matching doubling of a.  Doubling reduces with 0x1d
 * whenever the top bit of a was set.
 */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t v = 0;

	while (b) {
		if (b & 1)
			v ^= a;
		a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
		b >>= 1;
	}

	return v;
}
280
/*
 * Raise a to the power b in GF(2^8) by square-and-multiply.
 * The exponent is first reduced modulo 255 into the range 0..254, so
 * negative exponents are accepted.
 */
static uint8_t gfpow(uint8_t a, int b)
{
	uint8_t v = 1;

	b %= 255;
	if (b < 0)
		b += 255;

	while (b) {
		if (b & 1)
			v = gfmul(v, a);
		a = gfmul(a, a);
		b >>= 1;
	}

	return v;
}
298
/* GF(2^8) lookup tables, filled in by make_tables(). */
int tables_ready = 0;		/* set to 1 once make_tables() has run */
uint8_t raid6_gfmul[256][256];	/* [a][b] = a * b */
uint8_t raid6_gfexp[256];	/* [i] = 2^i; entry 255 is stored as 0 */
uint8_t raid6_gfinv[256];	/* [x] = x^254, i.e. x^-1 */
uint8_t raid6_gfexi[256];	/* [i] = inverse of (2^i xor 1) */
uint8_t raid6_gflog[256];	/* log base 2; entry 0 is stored as 0 */
uint8_t raid6_gfilog[256];	/* inverse log; entry 255 is stored as 0 */

/*
 * Build all of the tables above and set tables_ready.
 */
void make_tables(void)
{
	int i, j;
	uint8_t v;
	uint32_t b, log;

	/* Compute multiplication table */
	for (i = 0; i < 256; i++)
		for (j = 0; j < 256; j++)
			raid6_gfmul[i][j] = gfmul(i, j);

	/* Compute power-of-2 table (exponent) */
	v = 1;
	for (i = 0; i < 256; i++) {
		raid6_gfexp[i] = v;
		v = gfmul(v, 2);
		if (v == 1)
			v = 0;	/* For entry 255, not a real entry */
	}

	/* Compute inverse table x^-1 == x^254 */
	for (i = 0; i < 256; i++)
		raid6_gfinv[i] = gfpow(i, 254);

	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
	for (i = 0; i < 256; i ++)
		raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1];

	/* Compute log and inverse log */
	/* Modified code from:
	 *    http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
	 */
	b = 1;
	raid6_gflog[0] = 0;
	raid6_gfilog[255] = 0;

	for (log = 0; log < 255; log++) {
		raid6_gflog[b] = (uint8_t) log;
		raid6_gfilog[log] = (uint8_t) b;
		b = b << 1;
		/* 0435 (octal) == 0x11d: reduce once bit 8 is set */
		if (b & 256) b = b ^ 0435;
	}

	tables_ready = 1;
}
351
/* Shared all-zero buffer and its current size in bytes. */
uint8_t *zero;
int zero_size;

/*
 * Make sure 'zero' points at a zero-filled buffer of at least
 * 'chunk_size' bytes.  The buffer is replaced only when it is missing
 * or too small; it is never shrunk.
 */
void ensure_zero_has_size(int chunk_size)
{
	if (zero == NULL || chunk_size > zero_size) {
		/* free(NULL) is a no-op, so no guard is needed */
		free(zero);
		zero = xcalloc(1, chunk_size);
		zero_size = chunk_size;
	}
}
364
a6288483
N
365/* Following was taken from linux/drivers/md/raid6recov.c */
366
367/* Recover two failed data blocks. */
50786d47 368
a6288483 369void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
50786d47 370 uint8_t **ptrs, int neg_offset)
a6288483
N
371{
372 uint8_t *p, *q, *dp, *dq;
373 uint8_t px, qx, db;
374 const uint8_t *pbmul; /* P multiplier table for B data */
375 const uint8_t *qmul; /* Q multiplier table (for both) */
376
eae01ef0
N
377 if (faila > failb) {
378 int t = faila;
379 faila = failb;
380 failb = t;
381 }
382
50786d47
N
383 if (neg_offset) {
384 p = ptrs[-1];
385 q = ptrs[-2];
386 } else {
387 p = ptrs[disks-2];
388 q = ptrs[disks-1];
389 }
a6288483
N
390
391 /* Compute syndrome with zero for the missing data pages
392 Use the dead data pages as temporary storage for
393 delta p and delta q */
394 dp = ptrs[faila];
395 ptrs[faila] = zero;
396 dq = ptrs[failb];
397 ptrs[failb] = zero;
398
399 qsyndrome(dp, dq, ptrs, disks-2, bytes);
400
401 /* Restore pointer table */
402 ptrs[faila] = dp;
403 ptrs[failb] = dq;
404
405 /* Now, pick the proper data tables */
406 pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
407 qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
408
409 /* Now do it... */
410 while ( bytes-- ) {
411 px = *p ^ *dp;
412 qx = qmul[*q ^ *dq];
413 *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
414 *dp++ = db ^ px; /* Reconstructed A */
415 p++; q++;
416 }
417}
418
419/* Recover failure of one data block plus the P block */
50786d47
N
420void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
421 int neg_offset)
a6288483
N
422{
423 uint8_t *p, *q, *dq;
424 const uint8_t *qmul; /* Q multiplier table */
425
50786d47
N
426 if (neg_offset) {
427 p = ptrs[-1];
428 q = ptrs[-2];
429 } else {
430 p = ptrs[disks-2];
431 q = ptrs[disks-1];
432 }
a6288483
N
433
434 /* Compute syndrome with zero for the missing data page
435 Use the dead data page as temporary storage for delta q */
436 dq = ptrs[faila];
437 ptrs[faila] = zero;
438
439 qsyndrome(p, dq, ptrs, disks-2, bytes);
440
441 /* Restore pointer table */
442 ptrs[faila] = dq;
443
444 /* Now, pick the proper data tables */
445 qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
446
447 /* Now do it... */
448 while ( bytes-- ) {
449 *p++ ^= *dq = qmul[*q ^ *dq];
450 q++; dq++;
451 }
452}
453
9d0e7840
PS
454/* Try to find out if a specific disk has a problem */
455int raid6_check_disks(int data_disks, int start, int chunk_size,
456 int level, int layout, int diskP, int diskQ,
ef639064 457 uint8_t *p, uint8_t *q, char **stripes)
9d0e7840
PS
458{
459 int i;
460 int data_id, diskD;
461 uint8_t Px, Qx;
462 int curr_broken_disk = -1;
463 int prev_broken_disk = -1;
464 int broken_status = 0;
465
466 for(i = 0; i < chunk_size; i++) {
467 Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i];
468 Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i];
469
470 if((Px != 0) && (Qx == 0))
471 curr_broken_disk = diskP;
472
9d0e7840
PS
473 if((Px == 0) && (Qx != 0))
474 curr_broken_disk = diskQ;
475
9d0e7840 476 if((Px != 0) && (Qx != 0)) {
c4db5301
PS
477 data_id = (raid6_gflog[Qx] - raid6_gflog[Px]);
478 if(data_id < 0) data_id += 255;
9d0e7840
PS
479 diskD = geo_map(data_id, start/chunk_size,
480 data_disks + 2, level, layout);
481 curr_broken_disk = diskD;
482 }
483
484 if((Px == 0) && (Qx == 0))
681b7ae2 485 curr_broken_disk = prev_broken_disk;
9d0e7840 486
c4db5301
PS
487 if(curr_broken_disk >= data_disks + 2)
488 broken_status = 2;
489
9d0e7840
PS
490 switch(broken_status) {
491 case 0:
492 if(curr_broken_disk != -1) {
493 prev_broken_disk = curr_broken_disk;
494 broken_status = 1;
495 }
496 break;
497
498 case 1:
499 if(curr_broken_disk != prev_broken_disk)
500 broken_status = 2;
9d0e7840
PS
501 break;
502
503 case 2:
504 default:
505 curr_broken_disk = prev_broken_disk = -2;
506 break;
507 }
508 }
509
510 return curr_broken_disk;
511}
512
2fcb75ae
AK
513/*******************************************************************************
514 * Function: save_stripes
515 * Description:
516 * Function reads data (only data without P and Q) from array and writes
517 * it to buf and opcjonaly to backup files
518 * Parameters:
519 * source : A list of 'fds' of the active disks.
520 * Some may be absent
521 * offsets : A list of offsets on disk belonging
522 * to the array [bytes]
523 * raid_disks : geometry: number of disks in the array
524 * chunk_size : geometry: chunk size [bytes]
525 * level : geometry: RAID level
526 * layout : geometry: layout
527 * nwrites : number of backup files
528 * dest : A list of 'fds' for mirrored targets
529 * (e.g. backup files). They are already seeked to right
530 * (write) location. If NULL, data will be wrote
531 * to the buf only
532 * start : start address of data to read (must be stripe-aligned)
533 * [bytes]
534 * length - : length of data to read (must be stripe-aligned)
535 * [bytes]
536 * buf : buffer for data. It is large enough to hold
537 * one stripe. It is stripe aligned
538 * Returns:
539 * 0 : success
540 * -1 : fail
541 ******************************************************************************/
e86c9dd6
NB
542int save_stripes(int *source, unsigned long long *offsets,
543 int raid_disks, int chunk_size, int level, int layout,
544 int nwrites, int *dest,
a6288483
N
545 unsigned long long start, unsigned long long length,
546 char *buf)
e86c9dd6 547{
e86c9dd6
NB
548 int len;
549 int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
550 int disk;
a6288483 551 int i;
2fcb75ae 552 unsigned long long length_test;
e86c9dd6 553
a6288483
N
554 if (!tables_ready)
555 make_tables();
59679536 556 ensure_zero_has_size(chunk_size);
a6288483
N
557
558 len = data_disks * chunk_size;
2fcb75ae
AK
559 length_test = length / len;
560 length_test *= len;
561
562 if (length != length_test) {
563 dprintf("Error: save_stripes(): Data are not alligned. EXIT\n");
564 dprintf("\tArea for saving stripes (length) = %llu\n", length);
565 dprintf("\tWork step (len) = %i\n", len);
566 dprintf("\tExpected save area (length_test) = %llu\n",
567 length_test);
568 abort();
569 }
570
e86c9dd6 571 while (length > 0) {
a6288483
N
572 int failed = 0;
573 int fdisk[3], fblock[3];
574 for (disk = 0; disk < raid_disks ; disk++) {
575 unsigned long long offset;
576 int dnum;
a6288483
N
577
578 offset = (start/chunk_size/data_disks)*chunk_size;
579 dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1,
580 start/chunk_size/data_disks,
581 raid_disks, level, layout);
7236ee7a 582 if (dnum < 0) abort();
a6288483 583 if (source[dnum] < 0 ||
d16a7494
JS
584 lseek64(source[dnum],
585 offsets[dnum] + offset, 0) < 0 ||
586 read(source[dnum], buf+disk * chunk_size,
587 chunk_size) != chunk_size) {
a6288483
N
588 if (failed <= 2) {
589 fdisk[failed] = dnum;
590 fblock[failed] = disk;
591 failed++;
592 }
d16a7494 593 }
a6288483
N
594 }
595 if (failed == 0 || fblock[0] >= data_disks)
596 /* all data disks are good */
597 ;
598 else if (failed == 1 || fblock[1] >= data_disks+1) {
599 /* one failed data disk and good parity */
600 char *bufs[data_disks];
601 for (i=0; i < data_disks; i++)
602 if (fblock[0] == i)
603 bufs[i] = buf + data_disks*chunk_size;
604 else
605 bufs[i] = buf + i*chunk_size;
606
607 xor_blocks(buf + fblock[0]*chunk_size,
608 bufs, data_disks, chunk_size);
609 } else if (failed > 2 || level != 6)
610 /* too much failure */
e86c9dd6 611 return -1;
a6288483
N
612 else {
613 /* RAID6 computations needed. */
614 uint8_t *bufs[data_disks+4];
615 int qdisk;
616 int syndrome_disks;
617 disk = geo_map(-1, start/chunk_size/data_disks,
618 raid_disks, level, layout);
619 qdisk = geo_map(-2, start/chunk_size/data_disks,
620 raid_disks, level, layout);
621 if (is_ddf(layout)) {
622 /* q over 'raid_disks' blocks, in device order.
623 * 'p' and 'q' get to be all zero
624 */
625 for (i = 0; i < raid_disks; i++)
cc50ccdc
N
626 bufs[i] = zero;
627 for (i = 0; i < data_disks; i++) {
628 int dnum = geo_map(i,
629 start/chunk_size/data_disks,
630 raid_disks, level, layout);
631 int snum;
632 /* i is the logical block number, so is index to 'buf'.
633 * dnum is physical disk number
634 * and thus the syndrome number.
635 */
636 snum = dnum;
637 bufs[snum] = (uint8_t*)buf + chunk_size * i;
638 }
a6288483
N
639 syndrome_disks = raid_disks;
640 } else {
641 /* for md, q is over 'data_disks' blocks,
642 * starting immediately after 'q'
1eac9f84
N
643 * Note that for the '_6' variety, the p block
644 * makes a hole that we need to be careful of.
a6288483 645 */
1eac9f84
N
646 int j;
647 int snum = 0;
648 for (j = 0; j < raid_disks; j++) {
649 int dnum = (qdisk + 1 + j) % raid_disks;
650 if (dnum == disk || dnum == qdisk)
651 continue;
652 for (i = 0; i < data_disks; i++)
653 if (geo_map(i,
654 start/chunk_size/data_disks,
655 raid_disks, level, layout) == dnum)
656 break;
cc50ccdc
N
657 /* i is the logical block number, so is index to 'buf'.
658 * dnum is physical disk number
659 * snum is syndrome disk for which 0 is immediately after Q
660 */
cc50ccdc 661 bufs[snum] = (uint8_t*)buf + chunk_size * i;
1eac9f84
N
662
663 if (fblock[0] == i)
664 fdisk[0] = snum;
665 if (fblock[1] == i)
666 fdisk[1] = snum;
667 snum++;
cc50ccdc 668 }
a6288483 669
a6288483
N
670 syndrome_disks = data_disks;
671 }
cc50ccdc
N
672
673 /* Place P and Q blocks at end of bufs */
674 bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks;
675 bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1);
676
a6288483
N
677 if (fblock[1] == data_disks)
678 /* One data failed, and parity failed */
679 raid6_datap_recov(syndrome_disks+2, chunk_size,
50786d47 680 fdisk[0], bufs, 0);
cc50ccdc 681 else {
a6288483
N
682 /* Two data blocks failed, P,Q OK */
683 raid6_2data_recov(syndrome_disks+2, chunk_size,
50786d47 684 fdisk[0], fdisk[1], bufs, 0);
cc50ccdc 685 }
a6288483 686 }
ccced3dc 687 if (dest) {
2fcb75ae
AK
688 for (i = 0; i < nwrites; i++)
689 if (write(dest[i], buf, len) != len)
690 return -1;
ccced3dc
AK
691 } else {
692 /* build next stripe in buffer */
693 buf += len;
694 }
e86c9dd6
NB
695 length -= len;
696 start += len;
e86c9dd6
NB
697 }
698 return 0;
699}
700
701/* Restore data:
702 * We are given:
703 * A list of 'fds' of the active disks. Some may be '-1' for not-available.
353632d9 704 * A geometry: raid_disks, chunk_size, level, layout
e86c9dd6
NB
705 * An 'fd' to read from. It is already seeked to the right (Read) location.
706 * A start and length.
707 * The length must be a multiple of the stripe size.
708 *
709 * We build a full stripe in memory and then write it out.
710 * We assume that there are enough working devices.
711 */
712int restore_stripes(int *dest, unsigned long long *offsets,
713 int raid_disks, int chunk_size, int level, int layout,
353632d9 714 int source, unsigned long long read_offset,
2fcb75ae
AK
715 unsigned long long start, unsigned long long length,
716 char *src_buf)
e86c9dd6 717{
e9e43ec3 718 char *stripe_buf;
503975b9
N
719 char **stripes = xmalloc(raid_disks * sizeof(char*));
720 char **blocks = xmalloc(raid_disks * sizeof(char*));
e86c9dd6 721 int i;
758be4f1 722 int rv;
e86c9dd6 723
a6288483 724 int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
e86c9dd6 725
fcf57625
N
726 if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size))
727 stripe_buf = NULL;
d47a2925
N
728
729 if (zero == NULL || chunk_size > zero_size) {
730 if (zero)
731 free(zero);
503975b9 732 zero = xcalloc(1, chunk_size);
d47a2925 733 zero_size = chunk_size;
a6288483 734 }
d47a2925 735
d7be7d87
JS
736 if (stripe_buf == NULL || stripes == NULL || blocks == NULL ||
737 zero == NULL) {
758be4f1
LD
738 rv = -2;
739 goto abort;
e86c9dd6 740 }
2fcb75ae 741 for (i = 0; i < raid_disks; i++)
e86c9dd6
NB
742 stripes[i] = stripe_buf + i * chunk_size;
743 while (length > 0) {
f21e18ca 744 unsigned int len = data_disks * chunk_size;
e86c9dd6 745 unsigned long long offset;
48327135 746 int disk, qdisk;
a6288483 747 int syndrome_disks;
758be4f1
LD
748 if (length < len) {
749 rv = -3;
750 goto abort;
751 }
2fcb75ae 752 for (i = 0; i < data_disks; i++) {
e86c9dd6
NB
753 int disk = geo_map(i, start/chunk_size/data_disks,
754 raid_disks, level, layout);
2fcb75ae
AK
755 if (src_buf == NULL) {
756 /* read from file */
758be4f1
LD
757 if (lseek64(source, read_offset, 0) !=
758 (off64_t)read_offset) {
759 rv = -1;
760 goto abort;
761 }
2fcb75ae
AK
762 if (read(source,
763 stripes[disk],
758be4f1
LD
764 chunk_size) != chunk_size) {
765 rv = -1;
766 goto abort;
767 }
2fcb75ae
AK
768 } else {
769 /* read from input buffer */
770 memcpy(stripes[disk],
771 src_buf + read_offset,
772 chunk_size);
773 }
353632d9 774 read_offset += chunk_size;
e86c9dd6
NB
775 }
776 /* We have the data, now do the parity */
777 offset = (start/chunk_size/data_disks) * chunk_size;
48327135
NB
778 switch (level) {
779 case 4:
780 case 5:
781 disk = geo_map(-1, start/chunk_size/data_disks,
e86c9dd6 782 raid_disks, level, layout);
e0d95aac
N
783 for (i = 0; i < data_disks; i++)
784 blocks[i] = stripes[(disk+1+i) % raid_disks];
e86c9dd6 785 xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
48327135
NB
786 break;
787 case 6:
788 disk = geo_map(-1, start/chunk_size/data_disks,
789 raid_disks, level, layout);
790 qdisk = geo_map(-2, start/chunk_size/data_disks,
791 raid_disks, level, layout);
e0d95aac
N
792 if (is_ddf(layout)) {
793 /* q over 'raid_disks' blocks, in device order.
794 * 'p' and 'q' get to be all zero
795 */
796 for (i = 0; i < raid_disks; i++)
797 if (i == disk || i == qdisk)
a6288483 798 blocks[i] = (char*)zero;
e0d95aac
N
799 else
800 blocks[i] = stripes[i];
a6288483 801 syndrome_disks = raid_disks;
e0d95aac 802 } else {
a6288483 803 /* for md, q is over 'data_disks' blocks,
e0d95aac
N
804 * starting immediately after 'q'
805 */
806 for (i = 0; i < data_disks; i++)
807 blocks[i] = stripes[(qdisk+1+i) % raid_disks];
48327135 808
a6288483 809 syndrome_disks = data_disks;
e0d95aac 810 }
a6288483 811 qsyndrome((uint8_t*)stripes[disk],
1011e834 812 (uint8_t*)stripes[qdisk],
a6288483
N
813 (uint8_t**)blocks,
814 syndrome_disks, chunk_size);
48327135 815 break;
e86c9dd6
NB
816 }
817 for (i=0; i < raid_disks ; i++)
818 if (dest[i] >= 0) {
758be4f1
LD
819 if (lseek64(dest[i],
820 offsets[i]+offset, 0) < 0) {
821 rv = -1;
822 goto abort;
823 }
824 if (write(dest[i], stripes[i],
825 chunk_size) != chunk_size) {
826 rv = -1;
827 goto abort;
828 }
e86c9dd6
NB
829 }
830 length -= len;
831 start += len;
832 }
758be4f1
LD
833 rv = 0;
834
835abort:
836 free(stripe_buf);
837 free(stripes);
838 free(blocks);
839 return rv;
e86c9dd6
NB
840}
841
842#ifdef MAIN
843
48327135
NB
844int test_stripes(int *source, unsigned long long *offsets,
845 int raid_disks, int chunk_size, int level, int layout,
846 unsigned long long start, unsigned long long length)
847{
848 /* ready the data and p (and q) blocks, and check we got them right */
503975b9
N
849 char *stripe_buf = xmalloc(raid_disks * chunk_size);
850 char **stripes = xmalloc(raid_disks * sizeof(char*));
851 char **blocks = xmalloc(raid_disks * sizeof(char*));
ef639064
N
852 uint8_t *p = xmalloc(chunk_size);
853 uint8_t *q = xmalloc(chunk_size);
48327135
NB
854
855 int i;
9d0e7840 856 int diskP, diskQ;
48327135 857 int data_disks = raid_disks - (level == 5 ? 1: 2);
9d0e7840
PS
858
859 if (!tables_ready)
860 make_tables();
861
48327135
NB
862 for ( i = 0 ; i < raid_disks ; i++)
863 stripes[i] = stripe_buf + i * chunk_size;
864
865 while (length > 0) {
866 int disk;
867
868 for (i = 0 ; i < raid_disks ; i++) {
869 lseek64(source[i], offsets[i]+start, 0);
870 read(source[i], stripes[i], chunk_size);
871 }
872 for (i = 0 ; i < data_disks ; i++) {
873 int disk = geo_map(i, start/chunk_size, raid_disks,
874 level, layout);
875 blocks[i] = stripes[disk];
876 printf("%d->%d\n", i, disk);
877 }
878 switch(level) {
879 case 6:
521f349c 880 qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
9d0e7840 881 diskP = geo_map(-1, start/chunk_size, raid_disks,
48327135 882 level, layout);
9d0e7840
PS
883 if (memcmp(p, stripes[diskP], chunk_size) != 0) {
884 printf("P(%d) wrong at %llu\n", diskP,
48327135
NB
885 start / chunk_size);
886 }
9d0e7840 887 diskQ = geo_map(-2, start/chunk_size, raid_disks,
48327135 888 level, layout);
9d0e7840
PS
889 if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
890 printf("Q(%d) wrong at %llu\n", diskQ,
48327135
NB
891 start / chunk_size);
892 }
9d0e7840
PS
893 disk = raid6_check_disks(data_disks, start, chunk_size,
894 level, layout, diskP, diskQ,
895 p, q, stripes);
896 if(disk >= 0) {
897 printf("Possible failed disk: %d\n", disk);
898 }
899 if(disk == -2) {
900 printf("Failure detected, but disk unknown\n");
901 }
48327135
NB
902 break;
903 }
904 length -= chunk_size;
905 start += chunk_size;
906 }
907 return 0;
908}
909
e86c9dd6
NB
/*
 * Parse 'str' as a base-10 unsigned number.
 * If nothing was parsed, or characters remain after the number, *err is
 * set to 'str' and 0 is returned.  *err is left untouched on success, so
 * one 'err' variable can collect the outcome of several calls.
 */
unsigned long long getnum(char *str, char **err)
{
	char *e;
	unsigned long long rv = strtoull(str, &e, 10);
	if (e==str || *e) {
		*err = str;
		return 0;
	}
	return rv;
}
920
char const Name[] = "test_restripe";

/*
 * Test driver, built only when MAIN is defined.
 *
 *   save/restore/test file raid_disks chunk_size level layout start length devices...
 *
 * Each device may be given as "path:sectors"; the number after the colon
 * is multiplied by 512 and used as that device's data offset.
 * Exit status: 0 on success, 1 if the operation failed or on a usage
 * error, 2 for bad arguments, 3 if a file could not be opened.
 */
int main(int argc, char *argv[])
{
	/* save/restore/test file raid_disks chunk_size level layout start length devices...
	 */
	int save;
	int *fds;
	char *file;
	char *buf;
	int storefd;
	unsigned long long *offsets;
	int raid_disks, chunk_size, level, layout;
	unsigned long long start, length;
	int i;

	char *err = NULL;
	if (argc < 10) {
		fprintf(stderr, "Usage: test_stripe save/restore/test file raid_disks chunk_size level layout start length devices...\n");
		exit(1);
	}
	if (strcmp(argv[1], "save")==0)
		save = 1;
	else if (strcmp(argv[1], "restore") == 0)
		save = 0;
	else if (strcmp(argv[1], "test") == 0)
		save = 2;
	else {
		fprintf(stderr, "test_stripe: must give 'save', 'restore' or 'test'.\n");
		exit(2);
	}

	file = argv[2];
	raid_disks = getnum(argv[3], &err);
	chunk_size = getnum(argv[4], &err);
	level = getnum(argv[5], &err);
	layout = getnum(argv[6], &err);
	start = getnum(argv[7], &err);
	length = getnum(argv[8], &err);
	if (err) {
		fprintf(stderr, "test_stripe: Bad number: %s\n", err);
		exit(2);
	}
	if (argc != raid_disks + 9) {
		fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
			raid_disks, argc-9);
		exit(2);
	}
	/* every operation below divides by chunk_size */
	if (chunk_size <= 0) {
		fprintf(stderr, "test_stripe: chunk_size must be positive\n");
		exit(2);
	}
	fds = xmalloc(raid_disks * sizeof(*fds));
	offsets = xcalloc(raid_disks, sizeof(*offsets));

	storefd = open(file, O_RDWR);
	if (storefd < 0) {
		perror(file);
		fprintf(stderr, "test_stripe: could not open %s.\n", file);
		exit(3);
	}
	for (i=0; i<raid_disks; i++) {
		char *p;
		p = strchr(argv[9+i], ':');

		if(p != NULL) {
			/* split "path:sectors" in place */
			*p++ = '\0';
			offsets[i] = atoll(p) * 512;
		}

		fds[i] = open(argv[9+i], O_RDWR);
		if (fds[i] < 0) {
			perror(argv[9+i]);
			fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
			exit(3);
		}
	}

	buf = xmalloc(raid_disks * chunk_size);

	if (save == 1) {
		int rv = save_stripes(fds, offsets,
				      raid_disks, chunk_size, level, layout,
				      1, &storefd,
				      start, length, buf);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: save_stripes returned %d\n", rv);
			exit(1);
		}
	} else if (save == 2) {
		int rv = test_stripes(fds, offsets,
				      raid_disks, chunk_size, level, layout,
				      start, length);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: test_stripes returned %d\n", rv);
			exit(1);
		}
	} else {
		int rv = restore_stripes(fds, offsets,
					 raid_disks, chunk_size, level, layout,
					 storefd, 0ULL,
					 start, length, NULL);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: restore_stripes returned %d\n",
				rv);
			exit(1);
		}
	}
	exit(0);
}
1029
1030#endif /* MAIN */