]>
Commit | Line | Data |
---|---|---|
e86c9dd6 NB |
1 | /* |
2 | * mdadm - manage Linux "md" devices aka RAID arrays. | |
3 | * | |
e736b623 | 4 | * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de> |
e86c9dd6 NB |
5 | * |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with this program; if not, write to the Free Software | |
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
20 | * | |
21 | * Author: Neil Brown | |
22 | * Email: <neilb@suse.de> | |
23 | */ | |
24 | ||
25 | #include "mdadm.h" | |
a6288483 | 26 | #include <stdint.h> |
e86c9dd6 NB |
27 | |
/* To restripe, we read from the old geometry into a buffer, and
 * write from the buffer to the new geometry.
 * When reading, we might have missing devices and so could need
 * to reconstruct.
 * When writing, we need to create correct parity and Q.
 */
35 | ||
979afcb8 | 36 | int geo_map(int block, unsigned long long stripe, int raid_disks, |
e0d95aac | 37 | int level, int layout) |
e86c9dd6 | 38 | { |
48327135 | 39 | /* On the given stripe, find which disk in the array will have |
e86c9dd6 | 40 | * block numbered 'block'. |
48327135 NB |
41 | * '-1' means the parity block. |
42 | * '-2' means the Q syndrome. | |
e86c9dd6 NB |
43 | */ |
44 | int pd; | |
45 | ||
b6e317c8 AK |
46 | /* layout is not relevant for raid0 and raid4 */ |
47 | if ((level == 0) || | |
48 | (level == 4)) | |
49 | layout = 0; | |
50 | ||
e86c9dd6 NB |
51 | switch(level*100 + layout) { |
52 | case 000: | |
53 | case 400: | |
e0d95aac | 54 | case 500 + ALGORITHM_PARITY_N: |
e86c9dd6 NB |
55 | /* raid 4 isn't messed around by parity blocks */ |
56 | if (block == -1) | |
57 | return raid_disks-1; /* parity block */ | |
58 | return block; | |
59 | case 500 + ALGORITHM_LEFT_ASYMMETRIC: | |
60 | pd = (raid_disks-1) - stripe % raid_disks; | |
f1bbb5ff JS |
61 | if (block == -1) |
62 | return pd; | |
e86c9dd6 NB |
63 | if (block >= pd) |
64 | block++; | |
65 | return block; | |
66 | ||
67 | case 500 + ALGORITHM_RIGHT_ASYMMETRIC: | |
68 | pd = stripe % raid_disks; | |
f1bbb5ff JS |
69 | if (block == -1) |
70 | return pd; | |
e86c9dd6 NB |
71 | if (block >= pd) |
72 | block++; | |
73 | return block; | |
74 | ||
75 | case 500 + ALGORITHM_LEFT_SYMMETRIC: | |
76 | pd = (raid_disks - 1) - stripe % raid_disks; | |
f1bbb5ff JS |
77 | if (block == -1) |
78 | return pd; | |
e86c9dd6 NB |
79 | return (pd + 1 + block) % raid_disks; |
80 | ||
81 | case 500 + ALGORITHM_RIGHT_SYMMETRIC: | |
82 | pd = stripe % raid_disks; | |
f1bbb5ff JS |
83 | if (block == -1) |
84 | return pd; | |
e86c9dd6 NB |
85 | return (pd + 1 + block) % raid_disks; |
86 | ||
e0d95aac N |
87 | case 500 + ALGORITHM_PARITY_0: |
88 | return block + 1; | |
89 | ||
e0d95aac N |
90 | case 600 + ALGORITHM_PARITY_N_6: |
91 | if (block == -2) | |
92 | return raid_disks - 1; | |
93 | if (block == -1) | |
94 | return raid_disks - 2; /* parity block */ | |
95 | return block; | |
96 | case 600 + ALGORITHM_LEFT_ASYMMETRIC_6: | |
97 | if (block == -2) | |
98 | return raid_disks - 1; | |
99 | raid_disks--; | |
100 | pd = (raid_disks-1) - stripe % raid_disks; | |
f1bbb5ff JS |
101 | if (block == -1) |
102 | return pd; | |
e0d95aac N |
103 | if (block >= pd) |
104 | block++; | |
105 | return block; | |
106 | ||
107 | case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6: | |
108 | if (block == -2) | |
109 | return raid_disks - 1; | |
110 | raid_disks--; | |
111 | pd = stripe % raid_disks; | |
f1bbb5ff JS |
112 | if (block == -1) |
113 | return pd; | |
e0d95aac N |
114 | if (block >= pd) |
115 | block++; | |
116 | return block; | |
117 | ||
118 | case 600 + ALGORITHM_LEFT_SYMMETRIC_6: | |
119 | if (block == -2) | |
120 | return raid_disks - 1; | |
121 | raid_disks--; | |
122 | pd = (raid_disks - 1) - stripe % raid_disks; | |
f1bbb5ff JS |
123 | if (block == -1) |
124 | return pd; | |
e0d95aac N |
125 | return (pd + 1 + block) % raid_disks; |
126 | ||
127 | case 600 + ALGORITHM_RIGHT_SYMMETRIC_6: | |
128 | if (block == -2) | |
129 | return raid_disks - 1; | |
130 | raid_disks--; | |
131 | pd = stripe % raid_disks; | |
f1bbb5ff JS |
132 | if (block == -1) |
133 | return pd; | |
e0d95aac N |
134 | return (pd + 1 + block) % raid_disks; |
135 | ||
136 | case 600 + ALGORITHM_PARITY_0_6: | |
137 | if (block == -2) | |
138 | return raid_disks - 1; | |
139 | return block + 1; | |
140 | ||
e0d95aac N |
141 | case 600 + ALGORITHM_PARITY_0: |
142 | if (block == -1) | |
143 | return 0; | |
144 | if (block == -2) | |
145 | return 1; | |
146 | return block + 2; | |
147 | ||
e86c9dd6 NB |
148 | case 600 + ALGORITHM_LEFT_ASYMMETRIC: |
149 | pd = raid_disks - 1 - (stripe % raid_disks); | |
f1bbb5ff JS |
150 | if (block == -1) |
151 | return pd; | |
152 | if (block == -2) | |
153 | return (pd+1) % raid_disks; | |
e86c9dd6 NB |
154 | if (pd == raid_disks - 1) |
155 | return block+1; | |
156 | if (block >= pd) | |
157 | return block+2; | |
158 | return block; | |
159 | ||
e0d95aac N |
160 | case 600 + ALGORITHM_ROTATING_ZERO_RESTART: |
161 | /* Different order for calculating Q, otherwize same as ... */ | |
e86c9dd6 NB |
162 | case 600 + ALGORITHM_RIGHT_ASYMMETRIC: |
163 | pd = stripe % raid_disks; | |
f1bbb5ff JS |
164 | if (block == -1) |
165 | return pd; | |
166 | if (block == -2) | |
167 | return (pd+1) % raid_disks; | |
e86c9dd6 NB |
168 | if (pd == raid_disks - 1) |
169 | return block+1; | |
170 | if (block >= pd) | |
171 | return block+2; | |
172 | return block; | |
173 | ||
174 | case 600 + ALGORITHM_LEFT_SYMMETRIC: | |
175 | pd = raid_disks - 1 - (stripe % raid_disks); | |
f1bbb5ff JS |
176 | if (block == -1) |
177 | return pd; | |
178 | if (block == -2) | |
179 | return (pd+1) % raid_disks; | |
e86c9dd6 NB |
180 | return (pd + 2 + block) % raid_disks; |
181 | ||
182 | case 600 + ALGORITHM_RIGHT_SYMMETRIC: | |
183 | pd = stripe % raid_disks; | |
f1bbb5ff JS |
184 | if (block == -1) |
185 | return pd; | |
186 | if (block == -2) | |
187 | return (pd+1) % raid_disks; | |
e86c9dd6 | 188 | return (pd + 2 + block) % raid_disks; |
e0d95aac | 189 | |
e0d95aac N |
190 | case 600 + ALGORITHM_ROTATING_N_RESTART: |
191 | /* Same a left_asymmetric, by first stripe is | |
192 | * D D D P Q rather than | |
193 | * Q D D D P | |
194 | */ | |
195 | pd = raid_disks - 1 - ((stripe + 1) % raid_disks); | |
f1bbb5ff JS |
196 | if (block == -1) |
197 | return pd; | |
198 | if (block == -2) | |
199 | return (pd+1) % raid_disks; | |
e0d95aac N |
200 | if (pd == raid_disks - 1) |
201 | return block+1; | |
202 | if (block >= pd) | |
203 | return block+2; | |
204 | return block; | |
205 | ||
206 | case 600 + ALGORITHM_ROTATING_N_CONTINUE: | |
207 | /* Same as left_symmetric but Q is before P */ | |
208 | pd = raid_disks - 1 - (stripe % raid_disks); | |
f1bbb5ff JS |
209 | if (block == -1) |
210 | return pd; | |
211 | if (block == -2) | |
212 | return (pd+raid_disks-1) % raid_disks; | |
e0d95aac | 213 | return (pd + 1 + block) % raid_disks; |
e86c9dd6 NB |
214 | } |
215 | return -1; | |
216 | } | |
ad1a3c2f N |
217 | |
218 | int is_ddf(int layout) | |
e0d95aac N |
219 | { |
220 | switch (layout) | |
221 | { | |
222 | default: | |
223 | return 0; | |
224 | case ALGORITHM_ROTATING_N_CONTINUE: | |
225 | case ALGORITHM_ROTATING_N_RESTART: | |
226 | case ALGORITHM_ROTATING_ZERO_RESTART: | |
227 | return 1; | |
228 | } | |
229 | } | |
e86c9dd6 | 230 | |
/*
 * xor_blocks - byte-wise XOR of 'disks' source buffers into 'target'.
 *
 * target  : receives 'size' bytes; byte i is the XOR of sources[0..disks-1][i].
 * sources : array of 'disks' pointers, each to at least 'size' bytes.
 *
 * With disks == 0 the target is filled with zeros.
 */
void xor_blocks(char *target, char **sources, int disks, int size)
{
	int pos, src;

	/* Amazingly inefficient... */
	for (pos = 0; pos < size; pos++) {
		char acc = 0;

		for (src = 0; src < disks; src++)
			acc ^= sources[src][pos];
		target[pos] = acc;
	}
}
242 | ||
/*
 * qsyndrome - compute the P (XOR) and Q (GF(2^8)) blocks over 'disks'
 * source buffers.
 *
 * p, q    : output buffers of 'size' bytes each.
 * sources : array of 'disks' pointers to 'size'-byte buffers.
 *           sources[disks-1] is read unconditionally, so disks must be >= 1.
 *
 * For each byte position Q is built by starting from the last source and,
 * for each earlier source, multiplying the running value by 2 in GF(2^8)
 * (reduction constant 0x1d) and then XOR-ing the source byte in.
 */
void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
{
	int d, z;

	for (d = 0; d < size; d++) {
		uint8_t wp = sources[disks - 1][d];
		uint8_t wq = wp;

		for (z = disks - 2; z >= 0; z--) {
			uint8_t wd = sources[z][d];
			/* reduction term applies when the top bit is shifted out */
			uint8_t reduce = (wq & 0x80) ? 0x1d : 0x00;
			uint8_t doubled = (wq << 1) & 0xff;

			wp ^= wd;
			wq = doubled ^ reduce ^ wd;
		}
		p[d] = wp;
		q[d] = wq;
	}
}
262 | ||
a6288483 N |
/*
 * The following was taken from linux/drivers/md/mktables.c, and modified
 * to create in-memory tables rather than C code
 */

/*
 * gfmul - multiply a by b in GF(2^8), using shift-and-add with the
 * reduction constant 0x1d applied whenever the top bit of 'a' is set
 * before doubling.
 */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t product = 0;

	for (; b; b >>= 1) {
		if (b & 1)
			product ^= a;
		/* a *= 2 in the field */
		a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
	}

	return product;
}
280 | ||
/*
 * gfpow - raise a to the power b in GF(2^8) by square-and-multiply.
 * The exponent is first reduced modulo 255 and a negative remainder is
 * shifted into the range 0..254, so b == 0 (mod 255) yields 1.
 */
static uint8_t gfpow(uint8_t a, int b)
{
	uint8_t result = 1;

	b %= 255;
	if (b < 0)
		b += 255;

	for (; b; b >>= 1) {
		if (b & 1)
			result = gfmul(result, a);
		a = gfmul(a, a);
	}

	return result;
}
298 | ||
/* Set to 1 by make_tables() once every table below has been filled in. */
int tables_ready = 0;
uint8_t raid6_gfmul[256][256];	/* [a][b] = a * b in GF(2^8) */
uint8_t raid6_gfexp[256];	/* [i] = 2^i; entry 255 is forced to 0 */
uint8_t raid6_gfinv[256];	/* [x] = x^254 (= x^-1 for x != 0) */
uint8_t raid6_gfexi[256];	/* [i] = inverse of (2^i ^ 1) */
uint8_t raid6_gflog[256];	/* [2^i] = i, for i in 0..254; [0] = 0 */
uint8_t raid6_gfilog[256];	/* [i] = 2^i, for i in 0..254; [255] = 0 */

/*
 * make_tables - fill in the GF(2^8) lookup tables declared above and
 * set tables_ready.
 */
void make_tables(void)
{
	int i, j;
	uint8_t v;
	uint32_t b, exponent;

	/* Compute multiplication table */
	for (i = 0; i < 256; i++)
		for (j = 0; j < 256; j++)
			raid6_gfmul[i][j] = gfmul(i, j);

	/* Compute power-of-2 table (exponent) */
	v = 1;
	for (i = 0; i < 256; i++) {
		raid6_gfexp[i] = v;
		v = gfmul(v, 2);
		if (v == 1)
			v = 0;	/* For entry 255, not a real entry */
	}

	/* Compute inverse table x^-1 == x^254 */
	for (i = 0; i < 256; i++)
		raid6_gfinv[i] = gfpow(i, 254);

	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
	for (i = 0; i < 256; i++)
		raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1];

	/* Compute log and inverse log */
	/* Modified code from:
	 *    http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
	 */
	b = 1;
	raid6_gflog[0] = 0;
	raid6_gfilog[255] = 0;

	for (exponent = 0; exponent < 255; exponent++) {
		raid6_gflog[b] = (uint8_t) exponent;
		raid6_gfilog[exponent] = (uint8_t) b;
		b <<= 1;
		/* 0x11d (octal 0435): clears bit 8 and applies the 0x1d reduction */
		if (b & 0x100)
			b ^= 0x11d;
	}

	tables_ready = 1;
}
351 | ||
/* Shared all-zero buffer of zero_size bytes, used in place of missing blocks. */
uint8_t *zero;
int zero_size;

/*
 * ensure_zero_has_size - make sure 'zero' points to at least chunk_size
 * zeroed bytes, reallocating (and discarding the old buffer) when the
 * current one is missing or too small.
 */
void ensure_zero_has_size(int chunk_size)
{
	if (zero != NULL && chunk_size <= zero_size)
		return;

	free(zero);	/* free(NULL) is a no-op */
	zero = xcalloc(1, chunk_size);
	zero_size = chunk_size;
}
364 | ||
a6288483 N |
365 | /* Following was taken from linux/drivers/md/raid6recov.c */ |
366 | ||
367 | /* Recover two failed data blocks. */ | |
50786d47 | 368 | |
a6288483 | 369 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, |
50786d47 | 370 | uint8_t **ptrs, int neg_offset) |
a6288483 N |
371 | { |
372 | uint8_t *p, *q, *dp, *dq; | |
373 | uint8_t px, qx, db; | |
374 | const uint8_t *pbmul; /* P multiplier table for B data */ | |
375 | const uint8_t *qmul; /* Q multiplier table (for both) */ | |
376 | ||
eae01ef0 N |
377 | if (faila > failb) { |
378 | int t = faila; | |
379 | faila = failb; | |
380 | failb = t; | |
381 | } | |
382 | ||
50786d47 N |
383 | if (neg_offset) { |
384 | p = ptrs[-1]; | |
385 | q = ptrs[-2]; | |
386 | } else { | |
387 | p = ptrs[disks-2]; | |
388 | q = ptrs[disks-1]; | |
389 | } | |
a6288483 N |
390 | |
391 | /* Compute syndrome with zero for the missing data pages | |
392 | Use the dead data pages as temporary storage for | |
393 | delta p and delta q */ | |
394 | dp = ptrs[faila]; | |
395 | ptrs[faila] = zero; | |
396 | dq = ptrs[failb]; | |
397 | ptrs[failb] = zero; | |
398 | ||
399 | qsyndrome(dp, dq, ptrs, disks-2, bytes); | |
400 | ||
401 | /* Restore pointer table */ | |
402 | ptrs[faila] = dp; | |
403 | ptrs[failb] = dq; | |
404 | ||
405 | /* Now, pick the proper data tables */ | |
406 | pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; | |
407 | qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; | |
408 | ||
409 | /* Now do it... */ | |
410 | while ( bytes-- ) { | |
411 | px = *p ^ *dp; | |
412 | qx = qmul[*q ^ *dq]; | |
413 | *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ | |
414 | *dp++ = db ^ px; /* Reconstructed A */ | |
415 | p++; q++; | |
416 | } | |
417 | } | |
418 | ||
419 | /* Recover failure of one data block plus the P block */ | |
50786d47 N |
420 | void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs, |
421 | int neg_offset) | |
a6288483 N |
422 | { |
423 | uint8_t *p, *q, *dq; | |
424 | const uint8_t *qmul; /* Q multiplier table */ | |
425 | ||
50786d47 N |
426 | if (neg_offset) { |
427 | p = ptrs[-1]; | |
428 | q = ptrs[-2]; | |
429 | } else { | |
430 | p = ptrs[disks-2]; | |
431 | q = ptrs[disks-1]; | |
432 | } | |
a6288483 N |
433 | |
434 | /* Compute syndrome with zero for the missing data page | |
435 | Use the dead data page as temporary storage for delta q */ | |
436 | dq = ptrs[faila]; | |
437 | ptrs[faila] = zero; | |
438 | ||
439 | qsyndrome(p, dq, ptrs, disks-2, bytes); | |
440 | ||
441 | /* Restore pointer table */ | |
442 | ptrs[faila] = dq; | |
443 | ||
444 | /* Now, pick the proper data tables */ | |
445 | qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; | |
446 | ||
447 | /* Now do it... */ | |
448 | while ( bytes-- ) { | |
449 | *p++ ^= *dq = qmul[*q ^ *dq]; | |
450 | q++; dq++; | |
451 | } | |
452 | } | |
453 | ||
9d0e7840 PS |
454 | /* Try to find out if a specific disk has a problem */ |
455 | int raid6_check_disks(int data_disks, int start, int chunk_size, | |
456 | int level, int layout, int diskP, int diskQ, | |
ef639064 | 457 | uint8_t *p, uint8_t *q, char **stripes) |
9d0e7840 PS |
458 | { |
459 | int i; | |
460 | int data_id, diskD; | |
461 | uint8_t Px, Qx; | |
462 | int curr_broken_disk = -1; | |
463 | int prev_broken_disk = -1; | |
464 | int broken_status = 0; | |
465 | ||
466 | for(i = 0; i < chunk_size; i++) { | |
467 | Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i]; | |
468 | Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i]; | |
469 | ||
470 | if((Px != 0) && (Qx == 0)) | |
471 | curr_broken_disk = diskP; | |
472 | ||
9d0e7840 PS |
473 | if((Px == 0) && (Qx != 0)) |
474 | curr_broken_disk = diskQ; | |
475 | ||
9d0e7840 | 476 | if((Px != 0) && (Qx != 0)) { |
c4db5301 PS |
477 | data_id = (raid6_gflog[Qx] - raid6_gflog[Px]); |
478 | if(data_id < 0) data_id += 255; | |
9d0e7840 PS |
479 | diskD = geo_map(data_id, start/chunk_size, |
480 | data_disks + 2, level, layout); | |
481 | curr_broken_disk = diskD; | |
482 | } | |
483 | ||
484 | if((Px == 0) && (Qx == 0)) | |
681b7ae2 | 485 | curr_broken_disk = prev_broken_disk; |
9d0e7840 | 486 | |
c4db5301 PS |
487 | if(curr_broken_disk >= data_disks + 2) |
488 | broken_status = 2; | |
489 | ||
9d0e7840 PS |
490 | switch(broken_status) { |
491 | case 0: | |
492 | if(curr_broken_disk != -1) { | |
493 | prev_broken_disk = curr_broken_disk; | |
494 | broken_status = 1; | |
495 | } | |
496 | break; | |
497 | ||
498 | case 1: | |
499 | if(curr_broken_disk != prev_broken_disk) | |
500 | broken_status = 2; | |
9d0e7840 PS |
501 | break; |
502 | ||
503 | case 2: | |
504 | default: | |
505 | curr_broken_disk = prev_broken_disk = -2; | |
506 | break; | |
507 | } | |
508 | } | |
509 | ||
510 | return curr_broken_disk; | |
511 | } | |
512 | ||
2fcb75ae AK |
513 | /******************************************************************************* |
514 | * Function: save_stripes | |
515 | * Description: | |
516 | * Function reads data (only data without P and Q) from array and writes | |
517 | * it to buf and opcjonaly to backup files | |
518 | * Parameters: | |
519 | * source : A list of 'fds' of the active disks. | |
520 | * Some may be absent | |
521 | * offsets : A list of offsets on disk belonging | |
522 | * to the array [bytes] | |
523 | * raid_disks : geometry: number of disks in the array | |
524 | * chunk_size : geometry: chunk size [bytes] | |
525 | * level : geometry: RAID level | |
526 | * layout : geometry: layout | |
527 | * nwrites : number of backup files | |
528 | * dest : A list of 'fds' for mirrored targets | |
529 | * (e.g. backup files). They are already seeked to right | |
530 | * (write) location. If NULL, data will be wrote | |
531 | * to the buf only | |
532 | * start : start address of data to read (must be stripe-aligned) | |
533 | * [bytes] | |
534 | * length - : length of data to read (must be stripe-aligned) | |
535 | * [bytes] | |
536 | * buf : buffer for data. It is large enough to hold | |
537 | * one stripe. It is stripe aligned | |
538 | * Returns: | |
539 | * 0 : success | |
540 | * -1 : fail | |
541 | ******************************************************************************/ | |
e86c9dd6 NB |
542 | int save_stripes(int *source, unsigned long long *offsets, |
543 | int raid_disks, int chunk_size, int level, int layout, | |
544 | int nwrites, int *dest, | |
a6288483 N |
545 | unsigned long long start, unsigned long long length, |
546 | char *buf) | |
e86c9dd6 | 547 | { |
e86c9dd6 NB |
548 | int len; |
549 | int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2); | |
550 | int disk; | |
a6288483 | 551 | int i; |
2fcb75ae | 552 | unsigned long long length_test; |
e86c9dd6 | 553 | |
a6288483 N |
554 | if (!tables_ready) |
555 | make_tables(); | |
59679536 | 556 | ensure_zero_has_size(chunk_size); |
a6288483 N |
557 | |
558 | len = data_disks * chunk_size; | |
2fcb75ae AK |
559 | length_test = length / len; |
560 | length_test *= len; | |
561 | ||
562 | if (length != length_test) { | |
563 | dprintf("Error: save_stripes(): Data are not alligned. EXIT\n"); | |
564 | dprintf("\tArea for saving stripes (length) = %llu\n", length); | |
565 | dprintf("\tWork step (len) = %i\n", len); | |
566 | dprintf("\tExpected save area (length_test) = %llu\n", | |
567 | length_test); | |
568 | abort(); | |
569 | } | |
570 | ||
e86c9dd6 | 571 | while (length > 0) { |
a6288483 N |
572 | int failed = 0; |
573 | int fdisk[3], fblock[3]; | |
574 | for (disk = 0; disk < raid_disks ; disk++) { | |
575 | unsigned long long offset; | |
576 | int dnum; | |
a6288483 N |
577 | |
578 | offset = (start/chunk_size/data_disks)*chunk_size; | |
579 | dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1, | |
580 | start/chunk_size/data_disks, | |
581 | raid_disks, level, layout); | |
7236ee7a | 582 | if (dnum < 0) abort(); |
a6288483 | 583 | if (source[dnum] < 0 || |
d16a7494 JS |
584 | lseek64(source[dnum], |
585 | offsets[dnum] + offset, 0) < 0 || | |
586 | read(source[dnum], buf+disk * chunk_size, | |
587 | chunk_size) != chunk_size) { | |
a6288483 N |
588 | if (failed <= 2) { |
589 | fdisk[failed] = dnum; | |
590 | fblock[failed] = disk; | |
591 | failed++; | |
592 | } | |
d16a7494 | 593 | } |
a6288483 N |
594 | } |
595 | if (failed == 0 || fblock[0] >= data_disks) | |
596 | /* all data disks are good */ | |
597 | ; | |
598 | else if (failed == 1 || fblock[1] >= data_disks+1) { | |
599 | /* one failed data disk and good parity */ | |
600 | char *bufs[data_disks]; | |
601 | for (i=0; i < data_disks; i++) | |
602 | if (fblock[0] == i) | |
603 | bufs[i] = buf + data_disks*chunk_size; | |
604 | else | |
605 | bufs[i] = buf + i*chunk_size; | |
606 | ||
607 | xor_blocks(buf + fblock[0]*chunk_size, | |
608 | bufs, data_disks, chunk_size); | |
609 | } else if (failed > 2 || level != 6) | |
610 | /* too much failure */ | |
e86c9dd6 | 611 | return -1; |
a6288483 N |
612 | else { |
613 | /* RAID6 computations needed. */ | |
614 | uint8_t *bufs[data_disks+4]; | |
615 | int qdisk; | |
616 | int syndrome_disks; | |
617 | disk = geo_map(-1, start/chunk_size/data_disks, | |
618 | raid_disks, level, layout); | |
619 | qdisk = geo_map(-2, start/chunk_size/data_disks, | |
620 | raid_disks, level, layout); | |
621 | if (is_ddf(layout)) { | |
622 | /* q over 'raid_disks' blocks, in device order. | |
623 | * 'p' and 'q' get to be all zero | |
624 | */ | |
625 | for (i = 0; i < raid_disks; i++) | |
cc50ccdc N |
626 | bufs[i] = zero; |
627 | for (i = 0; i < data_disks; i++) { | |
628 | int dnum = geo_map(i, | |
629 | start/chunk_size/data_disks, | |
630 | raid_disks, level, layout); | |
631 | int snum; | |
632 | /* i is the logical block number, so is index to 'buf'. | |
633 | * dnum is physical disk number | |
634 | * and thus the syndrome number. | |
635 | */ | |
636 | snum = dnum; | |
637 | bufs[snum] = (uint8_t*)buf + chunk_size * i; | |
638 | } | |
a6288483 N |
639 | syndrome_disks = raid_disks; |
640 | } else { | |
641 | /* for md, q is over 'data_disks' blocks, | |
642 | * starting immediately after 'q' | |
1eac9f84 N |
643 | * Note that for the '_6' variety, the p block |
644 | * makes a hole that we need to be careful of. | |
a6288483 | 645 | */ |
1eac9f84 N |
646 | int j; |
647 | int snum = 0; | |
648 | for (j = 0; j < raid_disks; j++) { | |
649 | int dnum = (qdisk + 1 + j) % raid_disks; | |
650 | if (dnum == disk || dnum == qdisk) | |
651 | continue; | |
652 | for (i = 0; i < data_disks; i++) | |
653 | if (geo_map(i, | |
654 | start/chunk_size/data_disks, | |
655 | raid_disks, level, layout) == dnum) | |
656 | break; | |
cc50ccdc N |
657 | /* i is the logical block number, so is index to 'buf'. |
658 | * dnum is physical disk number | |
659 | * snum is syndrome disk for which 0 is immediately after Q | |
660 | */ | |
cc50ccdc | 661 | bufs[snum] = (uint8_t*)buf + chunk_size * i; |
1eac9f84 N |
662 | |
663 | if (fblock[0] == i) | |
664 | fdisk[0] = snum; | |
665 | if (fblock[1] == i) | |
666 | fdisk[1] = snum; | |
667 | snum++; | |
cc50ccdc | 668 | } |
a6288483 | 669 | |
a6288483 N |
670 | syndrome_disks = data_disks; |
671 | } | |
cc50ccdc N |
672 | |
673 | /* Place P and Q blocks at end of bufs */ | |
674 | bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks; | |
675 | bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1); | |
676 | ||
a6288483 N |
677 | if (fblock[1] == data_disks) |
678 | /* One data failed, and parity failed */ | |
679 | raid6_datap_recov(syndrome_disks+2, chunk_size, | |
50786d47 | 680 | fdisk[0], bufs, 0); |
cc50ccdc | 681 | else { |
a6288483 N |
682 | /* Two data blocks failed, P,Q OK */ |
683 | raid6_2data_recov(syndrome_disks+2, chunk_size, | |
50786d47 | 684 | fdisk[0], fdisk[1], bufs, 0); |
cc50ccdc | 685 | } |
a6288483 | 686 | } |
ccced3dc | 687 | if (dest) { |
2fcb75ae AK |
688 | for (i = 0; i < nwrites; i++) |
689 | if (write(dest[i], buf, len) != len) | |
690 | return -1; | |
ccced3dc AK |
691 | } else { |
692 | /* build next stripe in buffer */ | |
693 | buf += len; | |
694 | } | |
e86c9dd6 NB |
695 | length -= len; |
696 | start += len; | |
e86c9dd6 NB |
697 | } |
698 | return 0; | |
699 | } | |
700 | ||
701 | /* Restore data: | |
702 | * We are given: | |
703 | * A list of 'fds' of the active disks. Some may be '-1' for not-available. | |
353632d9 | 704 | * A geometry: raid_disks, chunk_size, level, layout |
e86c9dd6 NB |
705 | * An 'fd' to read from. It is already seeked to the right (Read) location. |
706 | * A start and length. | |
707 | * The length must be a multiple of the stripe size. | |
708 | * | |
709 | * We build a full stripe in memory and then write it out. | |
710 | * We assume that there are enough working devices. | |
711 | */ | |
712 | int restore_stripes(int *dest, unsigned long long *offsets, | |
713 | int raid_disks, int chunk_size, int level, int layout, | |
353632d9 | 714 | int source, unsigned long long read_offset, |
2fcb75ae AK |
715 | unsigned long long start, unsigned long long length, |
716 | char *src_buf) | |
e86c9dd6 | 717 | { |
e9e43ec3 | 718 | char *stripe_buf; |
503975b9 N |
719 | char **stripes = xmalloc(raid_disks * sizeof(char*)); |
720 | char **blocks = xmalloc(raid_disks * sizeof(char*)); | |
e86c9dd6 | 721 | int i; |
758be4f1 | 722 | int rv; |
e86c9dd6 | 723 | |
a6288483 | 724 | int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2); |
e86c9dd6 | 725 | |
fcf57625 N |
726 | if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size)) |
727 | stripe_buf = NULL; | |
d47a2925 N |
728 | |
729 | if (zero == NULL || chunk_size > zero_size) { | |
730 | if (zero) | |
731 | free(zero); | |
503975b9 | 732 | zero = xcalloc(1, chunk_size); |
d47a2925 | 733 | zero_size = chunk_size; |
a6288483 | 734 | } |
d47a2925 | 735 | |
d7be7d87 JS |
736 | if (stripe_buf == NULL || stripes == NULL || blocks == NULL || |
737 | zero == NULL) { | |
758be4f1 LD |
738 | rv = -2; |
739 | goto abort; | |
e86c9dd6 | 740 | } |
2fcb75ae | 741 | for (i = 0; i < raid_disks; i++) |
e86c9dd6 NB |
742 | stripes[i] = stripe_buf + i * chunk_size; |
743 | while (length > 0) { | |
f21e18ca | 744 | unsigned int len = data_disks * chunk_size; |
e86c9dd6 | 745 | unsigned long long offset; |
48327135 | 746 | int disk, qdisk; |
a6288483 | 747 | int syndrome_disks; |
758be4f1 LD |
748 | if (length < len) { |
749 | rv = -3; | |
750 | goto abort; | |
751 | } | |
2fcb75ae | 752 | for (i = 0; i < data_disks; i++) { |
e86c9dd6 NB |
753 | int disk = geo_map(i, start/chunk_size/data_disks, |
754 | raid_disks, level, layout); | |
2fcb75ae AK |
755 | if (src_buf == NULL) { |
756 | /* read from file */ | |
758be4f1 LD |
757 | if (lseek64(source, read_offset, 0) != |
758 | (off64_t)read_offset) { | |
759 | rv = -1; | |
760 | goto abort; | |
761 | } | |
2fcb75ae AK |
762 | if (read(source, |
763 | stripes[disk], | |
758be4f1 LD |
764 | chunk_size) != chunk_size) { |
765 | rv = -1; | |
766 | goto abort; | |
767 | } | |
2fcb75ae AK |
768 | } else { |
769 | /* read from input buffer */ | |
770 | memcpy(stripes[disk], | |
771 | src_buf + read_offset, | |
772 | chunk_size); | |
773 | } | |
353632d9 | 774 | read_offset += chunk_size; |
e86c9dd6 NB |
775 | } |
776 | /* We have the data, now do the parity */ | |
777 | offset = (start/chunk_size/data_disks) * chunk_size; | |
48327135 NB |
778 | switch (level) { |
779 | case 4: | |
780 | case 5: | |
781 | disk = geo_map(-1, start/chunk_size/data_disks, | |
e86c9dd6 | 782 | raid_disks, level, layout); |
e0d95aac N |
783 | for (i = 0; i < data_disks; i++) |
784 | blocks[i] = stripes[(disk+1+i) % raid_disks]; | |
e86c9dd6 | 785 | xor_blocks(stripes[disk], blocks, data_disks, chunk_size); |
48327135 NB |
786 | break; |
787 | case 6: | |
788 | disk = geo_map(-1, start/chunk_size/data_disks, | |
789 | raid_disks, level, layout); | |
790 | qdisk = geo_map(-2, start/chunk_size/data_disks, | |
791 | raid_disks, level, layout); | |
e0d95aac N |
792 | if (is_ddf(layout)) { |
793 | /* q over 'raid_disks' blocks, in device order. | |
794 | * 'p' and 'q' get to be all zero | |
795 | */ | |
796 | for (i = 0; i < raid_disks; i++) | |
797 | if (i == disk || i == qdisk) | |
a6288483 | 798 | blocks[i] = (char*)zero; |
e0d95aac N |
799 | else |
800 | blocks[i] = stripes[i]; | |
a6288483 | 801 | syndrome_disks = raid_disks; |
e0d95aac | 802 | } else { |
a6288483 | 803 | /* for md, q is over 'data_disks' blocks, |
e0d95aac N |
804 | * starting immediately after 'q' |
805 | */ | |
806 | for (i = 0; i < data_disks; i++) | |
807 | blocks[i] = stripes[(qdisk+1+i) % raid_disks]; | |
48327135 | 808 | |
a6288483 | 809 | syndrome_disks = data_disks; |
e0d95aac | 810 | } |
a6288483 | 811 | qsyndrome((uint8_t*)stripes[disk], |
1011e834 | 812 | (uint8_t*)stripes[qdisk], |
a6288483 N |
813 | (uint8_t**)blocks, |
814 | syndrome_disks, chunk_size); | |
48327135 | 815 | break; |
e86c9dd6 NB |
816 | } |
817 | for (i=0; i < raid_disks ; i++) | |
818 | if (dest[i] >= 0) { | |
758be4f1 LD |
819 | if (lseek64(dest[i], |
820 | offsets[i]+offset, 0) < 0) { | |
821 | rv = -1; | |
822 | goto abort; | |
823 | } | |
824 | if (write(dest[i], stripes[i], | |
825 | chunk_size) != chunk_size) { | |
826 | rv = -1; | |
827 | goto abort; | |
828 | } | |
e86c9dd6 NB |
829 | } |
830 | length -= len; | |
831 | start += len; | |
832 | } | |
758be4f1 LD |
833 | rv = 0; |
834 | ||
835 | abort: | |
836 | free(stripe_buf); | |
837 | free(stripes); | |
838 | free(blocks); | |
839 | return rv; | |
e86c9dd6 NB |
840 | } |
841 | ||
842 | #ifdef MAIN | |
843 | ||
48327135 NB |
844 | int test_stripes(int *source, unsigned long long *offsets, |
845 | int raid_disks, int chunk_size, int level, int layout, | |
846 | unsigned long long start, unsigned long long length) | |
847 | { | |
848 | /* ready the data and p (and q) blocks, and check we got them right */ | |
503975b9 N |
849 | char *stripe_buf = xmalloc(raid_disks * chunk_size); |
850 | char **stripes = xmalloc(raid_disks * sizeof(char*)); | |
851 | char **blocks = xmalloc(raid_disks * sizeof(char*)); | |
ef639064 N |
852 | uint8_t *p = xmalloc(chunk_size); |
853 | uint8_t *q = xmalloc(chunk_size); | |
48327135 NB |
854 | |
855 | int i; | |
9d0e7840 | 856 | int diskP, diskQ; |
48327135 | 857 | int data_disks = raid_disks - (level == 5 ? 1: 2); |
9d0e7840 PS |
858 | |
859 | if (!tables_ready) | |
860 | make_tables(); | |
861 | ||
48327135 NB |
862 | for ( i = 0 ; i < raid_disks ; i++) |
863 | stripes[i] = stripe_buf + i * chunk_size; | |
864 | ||
865 | while (length > 0) { | |
866 | int disk; | |
867 | ||
868 | for (i = 0 ; i < raid_disks ; i++) { | |
869 | lseek64(source[i], offsets[i]+start, 0); | |
870 | read(source[i], stripes[i], chunk_size); | |
871 | } | |
872 | for (i = 0 ; i < data_disks ; i++) { | |
873 | int disk = geo_map(i, start/chunk_size, raid_disks, | |
874 | level, layout); | |
875 | blocks[i] = stripes[disk]; | |
876 | printf("%d->%d\n", i, disk); | |
877 | } | |
878 | switch(level) { | |
879 | case 6: | |
521f349c | 880 | qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size); |
9d0e7840 | 881 | diskP = geo_map(-1, start/chunk_size, raid_disks, |
48327135 | 882 | level, layout); |
9d0e7840 PS |
883 | if (memcmp(p, stripes[diskP], chunk_size) != 0) { |
884 | printf("P(%d) wrong at %llu\n", diskP, | |
48327135 NB |
885 | start / chunk_size); |
886 | } | |
9d0e7840 | 887 | diskQ = geo_map(-2, start/chunk_size, raid_disks, |
48327135 | 888 | level, layout); |
9d0e7840 PS |
889 | if (memcmp(q, stripes[diskQ], chunk_size) != 0) { |
890 | printf("Q(%d) wrong at %llu\n", diskQ, | |
48327135 NB |
891 | start / chunk_size); |
892 | } | |
9d0e7840 PS |
893 | disk = raid6_check_disks(data_disks, start, chunk_size, |
894 | level, layout, diskP, diskQ, | |
895 | p, q, stripes); | |
896 | if(disk >= 0) { | |
897 | printf("Possible failed disk: %d\n", disk); | |
898 | } | |
899 | if(disk == -2) { | |
900 | printf("Failure detected, but disk unknown\n"); | |
901 | } | |
48327135 NB |
902 | break; |
903 | } | |
904 | length -= chunk_size; | |
905 | start += chunk_size; | |
906 | } | |
907 | return 0; | |
908 | } | |
909 | ||
e86c9dd6 NB |
/*
 * getnum - parse 'str' as an unsigned decimal number.
 *
 * Returns the parsed value.  If 'str' holds no digits or has trailing
 * characters, *err is set to 'str' and 0 is returned; *err is left
 * untouched on success, so several calls can share one error slot.
 */
unsigned long long getnum(char *str, char **err)
{
	char *end;
	unsigned long long value = strtoull(str, &end, 10);

	if (end == str || *end != '\0') {
		*err = str;
		return 0;
	}
	return value;
}
920 | ||
char const Name[] = "test_restripe";

/*
 * Test driver:
 *   save/restore/test file raid_disks chunk_size level layout start length devices...
 *
 * Each device argument may be given as "path:sectors"; the number after
 * the colon is multiplied by 512 and used as that device's data offset.
 *
 * Exit status: 0 on success, 1 for a usage error or a failed operation,
 * 2 for a bad argument, 3 if a file or device cannot be opened.
 */
int main(int argc, char *argv[])
{
	int save;
	int *fds;
	char *file;
	char *buf;
	int storefd;
	unsigned long long *offsets;
	int raid_disks, chunk_size, level, layout;
	unsigned long long start, length;
	int i;

	char *err = NULL;
	if (argc < 10) {
		fprintf(stderr, "Usage: test_stripe save/restore/test file raid_disks chunk_size level layout start length devices...\n");
		exit(1);
	}
	if (strcmp(argv[1], "save") == 0)
		save = 1;
	else if (strcmp(argv[1], "restore") == 0)
		save = 0;
	else if (strcmp(argv[1], "test") == 0)
		save = 2;
	else {
		fprintf(stderr, "test_stripe: must give 'save', 'restore' or 'test'.\n");
		exit(2);
	}

	file = argv[2];
	raid_disks = getnum(argv[3], &err);
	chunk_size = getnum(argv[4], &err);
	level = getnum(argv[5], &err);
	layout = getnum(argv[6], &err);
	start = getnum(argv[7], &err);
	length = getnum(argv[8], &err);
	if (err) {
		fprintf(stderr, "test_stripe: Bad number: %s\n", err);
		exit(2);
	}
	if (argc != raid_disks + 9) {
		fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
			raid_disks, argc-9);
		exit(2);
	}
	fds = xmalloc(raid_disks * sizeof(*fds));
	offsets = xcalloc(raid_disks, sizeof(*offsets));

	storefd = open(file, O_RDWR);
	if (storefd < 0) {
		perror(file);
		fprintf(stderr, "test_stripe: could not open %s.\n", file);
		exit(3);
	}
	for (i = 0; i < raid_disks; i++) {
		/* split an optional ":sectors" suffix off the device name */
		char *colon = strchr(argv[9+i], ':');

		if (colon != NULL) {
			*colon++ = '\0';
			offsets[i] = atoll(colon) * 512;
		}

		fds[i] = open(argv[9+i], O_RDWR);
		if (fds[i] < 0) {
			perror(argv[9+i]);
			fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
			exit(3);
		}
	}

	buf = xmalloc(raid_disks * chunk_size);

	if (save == 1) {
		int rv = save_stripes(fds, offsets,
				      raid_disks, chunk_size, level, layout,
				      1, &storefd,
				      start, length, buf);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: save_stripes returned %d\n", rv);
			exit(1);
		}
	} else if (save == 2) {
		int rv = test_stripes(fds, offsets,
				      raid_disks, chunk_size, level, layout,
				      start, length);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: test_stripes returned %d\n", rv);
			exit(1);
		}
	} else {
		int rv = restore_stripes(fds, offsets,
					 raid_disks, chunk_size, level, layout,
					 storefd, 0ULL,
					 start, length, NULL);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: restore_stripes returned %d\n",
				rv);
			exit(1);
		}
	}
	exit(0);
}
1029 | ||
1030 | #endif /* MAIN */ |