]>
Commit | Line | Data |
---|---|---|
e86c9dd6 NB |
1 | /* |
2 | * mdadm - manage Linux "md" devices aka RAID arrays. | |
3 | * | |
e736b623 | 4 | * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de> |
e86c9dd6 NB |
5 | * |
6 | * | |
7 | * This program is free software; you can redistribute it and/or modify | |
8 | * it under the terms of the GNU General Public License as published by | |
9 | * the Free Software Foundation; either version 2 of the License, or | |
10 | * (at your option) any later version. | |
11 | * | |
12 | * This program is distributed in the hope that it will be useful, | |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | * GNU General Public License for more details. | |
16 | * | |
17 | * You should have received a copy of the GNU General Public License | |
18 | * along with this program; if not, write to the Free Software | |
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
20 | * | |
21 | * Author: Neil Brown | |
22 | * Email: <neilb@suse.de> | |
23 | */ | |
24 | ||
25 | #include "mdadm.h" | |
a6288483 | 26 | #include <stdint.h> |
e86c9dd6 NB |
27 | |
28 | /* To restripe, we read from old geometry to a buffer, and | |
29 | * write from buffer to new geometry. | |
a6288483 N |
30 | * When reading, we might have missing devices and so could need |
31 | * to reconstruct. | |
32 | * When writing, we need to create correct parity and Q. | |
e86c9dd6 NB |
33 | * |
34 | */ | |
35 | ||
e0d95aac N |
36 | static int geo_map(int block, unsigned long long stripe, int raid_disks, |
37 | int level, int layout) | |
e86c9dd6 | 38 | { |
48327135 | 39 | /* On the given stripe, find which disk in the array will have |
e86c9dd6 | 40 | * block numbered 'block'. |
48327135 NB |
41 | * '-1' means the parity block. |
42 | * '-2' means the Q syndrome. | |
e86c9dd6 NB |
43 | */ |
44 | int pd; | |
45 | ||
46 | switch(level*100 + layout) { | |
47 | case 000: | |
48 | case 400: | |
e0d95aac | 49 | case 500 + ALGORITHM_PARITY_N: |
e86c9dd6 NB |
50 | /* raid 4 isn't messed around by parity blocks */ |
51 | if (block == -1) | |
52 | return raid_disks-1; /* parity block */ | |
53 | return block; | |
54 | case 500 + ALGORITHM_LEFT_ASYMMETRIC: | |
55 | pd = (raid_disks-1) - stripe % raid_disks; | |
56 | if (block == -1) return pd; | |
57 | if (block >= pd) | |
58 | block++; | |
59 | return block; | |
60 | ||
61 | case 500 + ALGORITHM_RIGHT_ASYMMETRIC: | |
62 | pd = stripe % raid_disks; | |
63 | if (block == -1) return pd; | |
64 | if (block >= pd) | |
65 | block++; | |
66 | return block; | |
67 | ||
68 | case 500 + ALGORITHM_LEFT_SYMMETRIC: | |
69 | pd = (raid_disks - 1) - stripe % raid_disks; | |
70 | if (block == -1) return pd; | |
71 | return (pd + 1 + block) % raid_disks; | |
72 | ||
73 | case 500 + ALGORITHM_RIGHT_SYMMETRIC: | |
74 | pd = stripe % raid_disks; | |
75 | if (block == -1) return pd; | |
76 | return (pd + 1 + block) % raid_disks; | |
77 | ||
e0d95aac N |
78 | case 500 + ALGORITHM_PARITY_0: |
79 | return block + 1; | |
80 | ||
81 | ||
82 | case 600 + ALGORITHM_PARITY_N_6: | |
83 | if (block == -2) | |
84 | return raid_disks - 1; | |
85 | if (block == -1) | |
86 | return raid_disks - 2; /* parity block */ | |
87 | return block; | |
88 | case 600 + ALGORITHM_LEFT_ASYMMETRIC_6: | |
89 | if (block == -2) | |
90 | return raid_disks - 1; | |
91 | raid_disks--; | |
92 | pd = (raid_disks-1) - stripe % raid_disks; | |
93 | if (block == -1) return pd; | |
94 | if (block >= pd) | |
95 | block++; | |
96 | return block; | |
97 | ||
98 | case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6: | |
99 | if (block == -2) | |
100 | return raid_disks - 1; | |
101 | raid_disks--; | |
102 | pd = stripe % raid_disks; | |
103 | if (block == -1) return pd; | |
104 | if (block >= pd) | |
105 | block++; | |
106 | return block; | |
107 | ||
108 | case 600 + ALGORITHM_LEFT_SYMMETRIC_6: | |
109 | if (block == -2) | |
110 | return raid_disks - 1; | |
111 | raid_disks--; | |
112 | pd = (raid_disks - 1) - stripe % raid_disks; | |
113 | if (block == -1) return pd; | |
114 | return (pd + 1 + block) % raid_disks; | |
115 | ||
116 | case 600 + ALGORITHM_RIGHT_SYMMETRIC_6: | |
117 | if (block == -2) | |
118 | return raid_disks - 1; | |
119 | raid_disks--; | |
120 | pd = stripe % raid_disks; | |
121 | if (block == -1) return pd; | |
122 | return (pd + 1 + block) % raid_disks; | |
123 | ||
124 | case 600 + ALGORITHM_PARITY_0_6: | |
125 | if (block == -2) | |
126 | return raid_disks - 1; | |
127 | return block + 1; | |
128 | ||
129 | ||
130 | case 600 + ALGORITHM_PARITY_0: | |
131 | if (block == -1) | |
132 | return 0; | |
133 | if (block == -2) | |
134 | return 1; | |
135 | return block + 2; | |
136 | ||
e86c9dd6 NB |
137 | case 600 + ALGORITHM_LEFT_ASYMMETRIC: |
138 | pd = raid_disks - 1 - (stripe % raid_disks); | |
139 | if (block == -1) return pd; | |
48327135 | 140 | if (block == -2) return (pd+1) % raid_disks; |
e86c9dd6 NB |
141 | if (pd == raid_disks - 1) |
142 | return block+1; | |
143 | if (block >= pd) | |
144 | return block+2; | |
145 | return block; | |
146 | ||
e0d95aac N |
147 | case 600 + ALGORITHM_ROTATING_ZERO_RESTART: |
148 | /* Different order for calculating Q, otherwize same as ... */ | |
e86c9dd6 NB |
149 | case 600 + ALGORITHM_RIGHT_ASYMMETRIC: |
150 | pd = stripe % raid_disks; | |
151 | if (block == -1) return pd; | |
48327135 | 152 | if (block == -2) return (pd+1) % raid_disks; |
e86c9dd6 NB |
153 | if (pd == raid_disks - 1) |
154 | return block+1; | |
155 | if (block >= pd) | |
156 | return block+2; | |
157 | return block; | |
158 | ||
159 | case 600 + ALGORITHM_LEFT_SYMMETRIC: | |
160 | pd = raid_disks - 1 - (stripe % raid_disks); | |
161 | if (block == -1) return pd; | |
48327135 | 162 | if (block == -2) return (pd+1) % raid_disks; |
e86c9dd6 NB |
163 | return (pd + 2 + block) % raid_disks; |
164 | ||
165 | case 600 + ALGORITHM_RIGHT_SYMMETRIC: | |
166 | pd = stripe % raid_disks; | |
167 | if (block == -1) return pd; | |
48327135 | 168 | if (block == -2) return (pd+1) % raid_disks; |
e86c9dd6 | 169 | return (pd + 2 + block) % raid_disks; |
e0d95aac N |
170 | |
171 | ||
172 | case 600 + ALGORITHM_ROTATING_N_RESTART: | |
173 | /* Same a left_asymmetric, by first stripe is | |
174 | * D D D P Q rather than | |
175 | * Q D D D P | |
176 | */ | |
177 | pd = raid_disks - 1 - ((stripe + 1) % raid_disks); | |
178 | if (block == -1) return pd; | |
179 | if (block == -2) return (pd+1) % raid_disks; | |
180 | if (pd == raid_disks - 1) | |
181 | return block+1; | |
182 | if (block >= pd) | |
183 | return block+2; | |
184 | return block; | |
185 | ||
186 | case 600 + ALGORITHM_ROTATING_N_CONTINUE: | |
187 | /* Same as left_symmetric but Q is before P */ | |
188 | pd = raid_disks - 1 - (stripe % raid_disks); | |
189 | if (block == -1) return pd; | |
190 | if (block == -2) return (pd+raid_disks-1) % raid_disks; | |
191 | return (pd + 1 + block) % raid_disks; | |
e86c9dd6 NB |
192 | } |
193 | return -1; | |
194 | } | |
e0d95aac N |
195 | static int is_ddf(int layout) |
196 | { | |
197 | switch (layout) | |
198 | { | |
199 | default: | |
200 | return 0; | |
201 | case ALGORITHM_ROTATING_N_CONTINUE: | |
202 | case ALGORITHM_ROTATING_N_RESTART: | |
203 | case ALGORITHM_ROTATING_ZERO_RESTART: | |
204 | return 1; | |
205 | } | |
206 | } | |
e86c9dd6 NB |
207 | |
208 | ||
/* Store in target[0..size) the byte-wise XOR of the 'disks' buffers listed
 * in 'sources'.  With disks <= 0 the target is simply zero-filled.
 */
static void xor_blocks(char *target, char **sources, int disks, int size)
{
	int pos;

	/* Amazingly inefficient: one byte at a time across every source */
	for (pos = 0; pos < size; pos++) {
		char acc = 0;
		char **src = sources;
		int left;

		for (left = disks; left > 0; left--)
			acc ^= (*src++)[pos];
		target[pos] = acc;
	}
}
220 | ||
/* Compute the P (plain XOR) and Q blocks over 'disks' source buffers of
 * 'size' bytes each.
 *
 * Q is accumulated from the highest-numbered source down to source 0:
 * the running value is doubled in GF(2^8) (shift left, XOR with 0x1d when
 * the top bit was set) and the next source byte is XORed in.
 * For each byte position all sources are read before p[] and then q[] are
 * written.
 */
static void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
{
	int pos, idx;

	for (pos = 0; pos < size; pos++) {
		uint8_t parity = sources[disks - 1][pos];
		uint8_t syn = parity;

		for (idx = disks - 2; idx >= 0; idx--) {
			uint8_t data = sources[idx][pos];
			uint8_t doubled = (uint8_t)(syn << 1);

			if (syn & 0x80)
				doubled ^= 0x1d;
			parity ^= data;
			syn = doubled ^ data;
		}
		p[pos] = parity;
		q[pos] = syn;
	}
}
240 | ||
a6288483 N |
241 | |
242 | /* | |
243 | * The following was taken from linux/drivers/md/mktables.c, and modified | |
244 | * to create in-memory tables rather than C code | |
245 | */ | |
/* Multiply a by b in GF(2^8), reducing with the constant 0x1d whenever
 * doubling 'a' carries out of the top bit (shift-and-add over the bits
 * of b, lowest bit first).
 */
static uint8_t gfmul(uint8_t a, uint8_t b)
{
	uint8_t product = 0;

	for (; b != 0; b >>= 1) {
		int carry = a & 0x80;

		if (b & 1)
			product ^= a;
		a <<= 1;
		if (carry)
			a ^= 0x1d;
	}
	return product;
}
259 | ||
/* Raise a to the power b in GF(2^8) by square-and-multiply.
 * The exponent is first reduced modulo 255 into the range 0..254, so
 * negative exponents are accepted.
 */
static uint8_t gfpow(uint8_t a, int b)
{
	uint8_t result = 1;
	int exp = b % 255;

	if (exp < 0)
		exp += 255;

	for (; exp != 0; exp >>= 1, a = gfmul(a, a))
		if (exp & 1)
			result = gfmul(result, a);

	return result;
}
277 | ||
int tables_ready = 0;
uint8_t raid6_gfmul[256][256];
uint8_t raid6_gfexp[256];
uint8_t raid6_gfinv[256];
uint8_t raid6_gfexi[256];

/* Fill the GF(2^8) lookup tables above and set tables_ready:
 *   raid6_gfmul[a][b] = a * b
 *   raid6_gfexp[x]    = 2^x   (entry 255 is forced to 0, not a real entry)
 *   raid6_gfinv[a]    = a^-1  (computed as a^254)
 *   raid6_gfexi[x]    = inverse of (2^x + 1)
 */
void make_tables(void)
{
	int a, b;
	uint8_t pow2;

	/* Multiplication table, plus the inverse of each row's multiplier */
	for (a = 0; a < 256; a++) {
		for (b = 0; b < 256; b++)
			raid6_gfmul[a][b] = gfmul(a, b);
		raid6_gfinv[a] = gfpow(a, 254);
	}

	/* Power-of-2 table, and the exponent-xor-inverse table derived from
	 * it; the inverse table is already complete at this point.
	 */
	pow2 = 1;
	for (a = 0; a < 256; a++) {
		raid6_gfexp[a] = pow2;
		raid6_gfexi[a] = raid6_gfinv[pow2 ^ 1];

		pow2 = gfmul(pow2, 2);
		if (pow2 == 1)
			pow2 = 0;	/* For entry 255, not a real entry */
	}

	tables_ready = 1;
}
312 | ||
313 | uint8_t *zero; | |
314 | /* Following was taken from linux/drivers/md/raid6recov.c */ | |
315 | ||
316 | /* Recover two failed data blocks. */ | |
317 | void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, | |
318 | uint8_t **ptrs) | |
319 | { | |
320 | uint8_t *p, *q, *dp, *dq; | |
321 | uint8_t px, qx, db; | |
322 | const uint8_t *pbmul; /* P multiplier table for B data */ | |
323 | const uint8_t *qmul; /* Q multiplier table (for both) */ | |
324 | ||
325 | p = ptrs[disks-2]; | |
326 | q = ptrs[disks-1]; | |
327 | ||
328 | /* Compute syndrome with zero for the missing data pages | |
329 | Use the dead data pages as temporary storage for | |
330 | delta p and delta q */ | |
331 | dp = ptrs[faila]; | |
332 | ptrs[faila] = zero; | |
333 | dq = ptrs[failb]; | |
334 | ptrs[failb] = zero; | |
335 | ||
336 | qsyndrome(dp, dq, ptrs, disks-2, bytes); | |
337 | ||
338 | /* Restore pointer table */ | |
339 | ptrs[faila] = dp; | |
340 | ptrs[failb] = dq; | |
341 | ||
342 | /* Now, pick the proper data tables */ | |
343 | pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; | |
344 | qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; | |
345 | ||
346 | /* Now do it... */ | |
347 | while ( bytes-- ) { | |
348 | px = *p ^ *dp; | |
349 | qx = qmul[*q ^ *dq]; | |
350 | *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ | |
351 | *dp++ = db ^ px; /* Reconstructed A */ | |
352 | p++; q++; | |
353 | } | |
354 | } | |
355 | ||
356 | /* Recover failure of one data block plus the P block */ | |
357 | void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs) | |
358 | { | |
359 | uint8_t *p, *q, *dq; | |
360 | const uint8_t *qmul; /* Q multiplier table */ | |
361 | ||
362 | p = ptrs[disks-2]; | |
363 | q = ptrs[disks-1]; | |
364 | ||
365 | /* Compute syndrome with zero for the missing data page | |
366 | Use the dead data page as temporary storage for delta q */ | |
367 | dq = ptrs[faila]; | |
368 | ptrs[faila] = zero; | |
369 | ||
370 | qsyndrome(p, dq, ptrs, disks-2, bytes); | |
371 | ||
372 | /* Restore pointer table */ | |
373 | ptrs[faila] = dq; | |
374 | ||
375 | /* Now, pick the proper data tables */ | |
376 | qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; | |
377 | ||
378 | /* Now do it... */ | |
379 | while ( bytes-- ) { | |
380 | *p++ ^= *dq = qmul[*q ^ *dq]; | |
381 | q++; dq++; | |
382 | } | |
383 | } | |
384 | ||
e86c9dd6 NB |
385 | /* Save data: |
386 | * We are given: | |
a6288483 | 387 | * A list of 'fds' of the active disks. Some may be absent. |
48327135 | 388 | * A geometry: raid_disks, chunk_size, level, layout |
e86c9dd6 NB |
389 | * A list of 'fds' for mirrored targets. They are already seeked to |
390 | * right (Write) location | |
a6288483 N |
391 | * A start and length which must be stripe-aligned |
392 | * 'buf' is large enough to hold one stripe, and is aligned | |
e86c9dd6 NB |
393 | */ |
394 | ||
395 | int save_stripes(int *source, unsigned long long *offsets, | |
396 | int raid_disks, int chunk_size, int level, int layout, | |
397 | int nwrites, int *dest, | |
a6288483 N |
398 | unsigned long long start, unsigned long long length, |
399 | char *buf) | |
e86c9dd6 | 400 | { |
e86c9dd6 NB |
401 | int len; |
402 | int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2); | |
403 | int disk; | |
a6288483 | 404 | int i; |
e86c9dd6 | 405 | |
a6288483 N |
406 | if (!tables_ready) |
407 | make_tables(); | |
408 | ||
409 | if (zero == NULL) { | |
410 | zero = malloc(chunk_size); | |
411 | memset(zero, 0, chunk_size); | |
412 | } | |
413 | ||
414 | len = data_disks * chunk_size; | |
e86c9dd6 | 415 | while (length > 0) { |
a6288483 N |
416 | int failed = 0; |
417 | int fdisk[3], fblock[3]; | |
418 | for (disk = 0; disk < raid_disks ; disk++) { | |
419 | unsigned long long offset; | |
420 | int dnum; | |
a6288483 N |
421 | |
422 | offset = (start/chunk_size/data_disks)*chunk_size; | |
423 | dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1, | |
424 | start/chunk_size/data_disks, | |
425 | raid_disks, level, layout); | |
7236ee7a | 426 | if (dnum < 0) abort(); |
a6288483 | 427 | if (source[dnum] < 0 || |
cc50ccdc | 428 | lseek64(source[dnum], offsets[dnum]+offset, 0) < 0 || |
7236ee7a N |
429 | read(source[dnum], buf+disk * chunk_size, chunk_size) |
430 | != chunk_size) | |
a6288483 N |
431 | if (failed <= 2) { |
432 | fdisk[failed] = dnum; | |
433 | fblock[failed] = disk; | |
434 | failed++; | |
435 | } | |
436 | } | |
437 | if (failed == 0 || fblock[0] >= data_disks) | |
438 | /* all data disks are good */ | |
439 | ; | |
440 | else if (failed == 1 || fblock[1] >= data_disks+1) { | |
441 | /* one failed data disk and good parity */ | |
442 | char *bufs[data_disks]; | |
443 | for (i=0; i < data_disks; i++) | |
444 | if (fblock[0] == i) | |
445 | bufs[i] = buf + data_disks*chunk_size; | |
446 | else | |
447 | bufs[i] = buf + i*chunk_size; | |
448 | ||
449 | xor_blocks(buf + fblock[0]*chunk_size, | |
450 | bufs, data_disks, chunk_size); | |
451 | } else if (failed > 2 || level != 6) | |
452 | /* too much failure */ | |
e86c9dd6 | 453 | return -1; |
a6288483 N |
454 | else { |
455 | /* RAID6 computations needed. */ | |
456 | uint8_t *bufs[data_disks+4]; | |
457 | int qdisk; | |
458 | int syndrome_disks; | |
459 | disk = geo_map(-1, start/chunk_size/data_disks, | |
460 | raid_disks, level, layout); | |
461 | qdisk = geo_map(-2, start/chunk_size/data_disks, | |
462 | raid_disks, level, layout); | |
463 | if (is_ddf(layout)) { | |
464 | /* q over 'raid_disks' blocks, in device order. | |
465 | * 'p' and 'q' get to be all zero | |
466 | */ | |
467 | for (i = 0; i < raid_disks; i++) | |
cc50ccdc N |
468 | bufs[i] = zero; |
469 | for (i = 0; i < data_disks; i++) { | |
470 | int dnum = geo_map(i, | |
471 | start/chunk_size/data_disks, | |
472 | raid_disks, level, layout); | |
473 | int snum; | |
474 | /* i is the logical block number, so is index to 'buf'. | |
475 | * dnum is physical disk number | |
476 | * and thus the syndrome number. | |
477 | */ | |
478 | snum = dnum; | |
479 | bufs[snum] = (uint8_t*)buf + chunk_size * i; | |
480 | } | |
a6288483 N |
481 | syndrome_disks = raid_disks; |
482 | } else { | |
483 | /* for md, q is over 'data_disks' blocks, | |
484 | * starting immediately after 'q' | |
1eac9f84 N |
485 | * Note that for the '_6' variety, the p block |
486 | * makes a hole that we need to be careful of. | |
a6288483 | 487 | */ |
1eac9f84 N |
488 | int j; |
489 | int snum = 0; | |
490 | for (j = 0; j < raid_disks; j++) { | |
491 | int dnum = (qdisk + 1 + j) % raid_disks; | |
492 | if (dnum == disk || dnum == qdisk) | |
493 | continue; | |
494 | for (i = 0; i < data_disks; i++) | |
495 | if (geo_map(i, | |
496 | start/chunk_size/data_disks, | |
497 | raid_disks, level, layout) == dnum) | |
498 | break; | |
cc50ccdc N |
499 | /* i is the logical block number, so is index to 'buf'. |
500 | * dnum is physical disk number | |
501 | * snum is syndrome disk for which 0 is immediately after Q | |
502 | */ | |
cc50ccdc | 503 | bufs[snum] = (uint8_t*)buf + chunk_size * i; |
1eac9f84 N |
504 | |
505 | if (fblock[0] == i) | |
506 | fdisk[0] = snum; | |
507 | if (fblock[1] == i) | |
508 | fdisk[1] = snum; | |
509 | snum++; | |
cc50ccdc | 510 | } |
a6288483 | 511 | |
a6288483 N |
512 | syndrome_disks = data_disks; |
513 | } | |
cc50ccdc N |
514 | |
515 | /* Place P and Q blocks at end of bufs */ | |
516 | bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks; | |
517 | bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1); | |
518 | ||
a6288483 N |
519 | if (fblock[1] == data_disks) |
520 | /* One data failed, and parity failed */ | |
521 | raid6_datap_recov(syndrome_disks+2, chunk_size, | |
522 | fdisk[0], bufs); | |
cc50ccdc N |
523 | else { |
524 | if (fdisk[0] > fdisk[1]) { | |
525 | int t = fdisk[0]; | |
526 | fdisk[0] = fdisk[1]; | |
527 | fdisk[1] = t; | |
528 | } | |
a6288483 N |
529 | /* Two data blocks failed, P,Q OK */ |
530 | raid6_2data_recov(syndrome_disks+2, chunk_size, | |
531 | fdisk[0], fdisk[1], bufs); | |
cc50ccdc | 532 | } |
a6288483 N |
533 | } |
534 | ||
e86c9dd6 NB |
535 | for (i=0; i<nwrites; i++) |
536 | if (write(dest[i], buf, len) != len) | |
537 | return -1; | |
a6288483 | 538 | |
e86c9dd6 NB |
539 | length -= len; |
540 | start += len; | |
e86c9dd6 NB |
541 | } |
542 | return 0; | |
543 | } | |
544 | ||
545 | /* Restore data: | |
546 | * We are given: | |
547 | * A list of 'fds' of the active disks. Some may be '-1' for not-available. | |
353632d9 | 548 | * A geometry: raid_disks, chunk_size, level, layout |
e86c9dd6 NB |
549 | * An 'fd' to read from. It is already seeked to the right (Read) location. |
550 | * A start and length. | |
551 | * The length must be a multiple of the stripe size. | |
552 | * | |
553 | * We build a full stripe in memory and then write it out. | |
554 | * We assume that there are enough working devices. | |
555 | */ | |
556 | int restore_stripes(int *dest, unsigned long long *offsets, | |
557 | int raid_disks, int chunk_size, int level, int layout, | |
353632d9 | 558 | int source, unsigned long long read_offset, |
e86c9dd6 NB |
559 | unsigned long long start, unsigned long long length) |
560 | { | |
e9e43ec3 | 561 | char *stripe_buf; |
e86c9dd6 NB |
562 | char **stripes = malloc(raid_disks * sizeof(char*)); |
563 | char **blocks = malloc(raid_disks * sizeof(char*)); | |
564 | int i; | |
565 | ||
a6288483 | 566 | int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2); |
e86c9dd6 | 567 | |
e9e43ec3 | 568 | posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size); |
a6288483 N |
569 | if (zero == NULL) { |
570 | zero = malloc(chunk_size); | |
571 | if (zero) | |
572 | memset(zero, 0, chunk_size); | |
573 | } | |
e0d95aac N |
574 | if (stripe_buf == NULL || stripes == NULL || blocks == NULL |
575 | || zero == NULL) { | |
e86c9dd6 NB |
576 | free(stripe_buf); |
577 | free(stripes); | |
578 | free(blocks); | |
e0d95aac | 579 | free(zero); |
e86c9dd6 NB |
580 | return -2; |
581 | } | |
582 | for (i=0; i<raid_disks; i++) | |
583 | stripes[i] = stripe_buf + i * chunk_size; | |
584 | while (length > 0) { | |
585 | int len = data_disks * chunk_size; | |
586 | unsigned long long offset; | |
48327135 | 587 | int disk, qdisk; |
a6288483 | 588 | int syndrome_disks; |
e86c9dd6 NB |
589 | if (length < len) |
590 | return -3; | |
591 | for (i=0; i < data_disks; i++) { | |
592 | int disk = geo_map(i, start/chunk_size/data_disks, | |
593 | raid_disks, level, layout); | |
353632d9 NB |
594 | if (lseek64(source, read_offset, 0) != read_offset) |
595 | return -1; | |
e86c9dd6 NB |
596 | if (read(source, stripes[disk], chunk_size) != chunk_size) |
597 | return -1; | |
353632d9 | 598 | read_offset += chunk_size; |
e86c9dd6 NB |
599 | } |
600 | /* We have the data, now do the parity */ | |
601 | offset = (start/chunk_size/data_disks) * chunk_size; | |
48327135 NB |
602 | switch (level) { |
603 | case 4: | |
604 | case 5: | |
605 | disk = geo_map(-1, start/chunk_size/data_disks, | |
e86c9dd6 | 606 | raid_disks, level, layout); |
e0d95aac N |
607 | for (i = 0; i < data_disks; i++) |
608 | blocks[i] = stripes[(disk+1+i) % raid_disks]; | |
e86c9dd6 | 609 | xor_blocks(stripes[disk], blocks, data_disks, chunk_size); |
48327135 NB |
610 | break; |
611 | case 6: | |
612 | disk = geo_map(-1, start/chunk_size/data_disks, | |
613 | raid_disks, level, layout); | |
614 | qdisk = geo_map(-2, start/chunk_size/data_disks, | |
615 | raid_disks, level, layout); | |
e0d95aac N |
616 | if (is_ddf(layout)) { |
617 | /* q over 'raid_disks' blocks, in device order. | |
618 | * 'p' and 'q' get to be all zero | |
619 | */ | |
620 | for (i = 0; i < raid_disks; i++) | |
621 | if (i == disk || i == qdisk) | |
a6288483 | 622 | blocks[i] = (char*)zero; |
e0d95aac N |
623 | else |
624 | blocks[i] = stripes[i]; | |
a6288483 | 625 | syndrome_disks = raid_disks; |
e0d95aac | 626 | } else { |
a6288483 | 627 | /* for md, q is over 'data_disks' blocks, |
e0d95aac N |
628 | * starting immediately after 'q' |
629 | */ | |
630 | for (i = 0; i < data_disks; i++) | |
631 | blocks[i] = stripes[(qdisk+1+i) % raid_disks]; | |
48327135 | 632 | |
a6288483 | 633 | syndrome_disks = data_disks; |
e0d95aac | 634 | } |
a6288483 N |
635 | qsyndrome((uint8_t*)stripes[disk], |
636 | (uint8_t*)stripes[qdisk], | |
637 | (uint8_t**)blocks, | |
638 | syndrome_disks, chunk_size); | |
48327135 | 639 | break; |
e86c9dd6 NB |
640 | } |
641 | for (i=0; i < raid_disks ; i++) | |
642 | if (dest[i] >= 0) { | |
643 | if (lseek64(dest[i], offsets[i]+offset, 0) < 0) | |
644 | return -1; | |
645 | if (write(dest[i], stripes[i], chunk_size) != chunk_size) | |
646 | return -1; | |
647 | } | |
648 | length -= len; | |
649 | start += len; | |
650 | } | |
651 | return 0; | |
652 | } | |
653 | ||
654 | #ifdef MAIN | |
655 | ||
48327135 NB |
/* Read the data, P and Q blocks chunk by chunk, recompute P and Q from the
 * data, and print a message for every chunk whose stored P or Q differs.
 * Only level 6 is actually checked.
 *
 * Returns 0 once the range has been scanned (mismatches are only printed),
 * -1 if a device cannot be read, -2 if memory cannot be allocated.
 */
int test_stripes(int *source, unsigned long long *offsets,
		 int raid_disks, int chunk_size, int level, int layout,
		 unsigned long long start, unsigned long long length)
{
	char *stripe_buf = malloc(raid_disks * chunk_size);
	char **stripes = malloc(raid_disks * sizeof(char*));
	char **blocks = malloc(raid_disks * sizeof(char*));
	char *p = malloc(chunk_size);
	char *q = malloc(chunk_size);
	int data_disks = raid_disks - (level == 5 ? 1: 2);
	int rv = 0;
	int i;

	if (stripe_buf == NULL || stripes == NULL || blocks == NULL ||
	    p == NULL || q == NULL) {
		rv = -2;
		goto out;
	}

	/* stripes[d] is the chunk read from device d */
	for (i = 0; i < raid_disks; i++)
		stripes[i] = stripe_buf + i * chunk_size;

	while (length > 0) {
		int disk;

		for (i = 0; i < raid_disks; i++) {
			/* comparing against stale buffer contents would be
			 * meaningless, so a failed read ends the test
			 */
			if (lseek64(source[i], offsets[i] + start, 0) < 0 ||
			    read(source[i], stripes[i], chunk_size) != chunk_size) {
				rv = -1;
				goto out;
			}
		}
		/* Collect the data chunks in logical block order */
		for (i = 0; i < data_disks; i++) {
			int ddisk = geo_map(i, start/chunk_size, raid_disks,
					    level, layout);
			blocks[i] = stripes[ddisk];
			printf("%d->%d\n", i, ddisk);
		}
		switch (level) {
		case 6:
			qsyndrome((uint8_t*)p, (uint8_t*)q, (uint8_t**)blocks,
				  data_disks, chunk_size);
			disk = geo_map(-1, start/chunk_size, raid_disks,
				       level, layout);
			if (memcmp(p, stripes[disk], chunk_size) != 0) {
				printf("P(%d) wrong at %llu\n", disk,
				       start / chunk_size);
			}
			disk = geo_map(-2, start/chunk_size, raid_disks,
				       level, layout);
			if (memcmp(q, stripes[disk], chunk_size) != 0) {
				printf("Q(%d) wrong at %llu\n", disk,
				       start / chunk_size);
			}
			break;
		}
		length -= chunk_size;
		start += chunk_size;
	}

out:
	free(stripe_buf);
	free(stripes);
	free(blocks);
	free(p);
	free(q);
	return rv;
}
707 | ||
e86c9dd6 NB |
/* Parse 'str' as an unsigned decimal number.
 * If the string is empty or has anything after the digits, *err is set to
 * 'str' and 0 is returned; otherwise *err is left untouched, so a caller
 * can parse several numbers and check *err once afterwards.
 */
unsigned long long getnum(char *str, char **err)
{
	char *end = NULL;
	unsigned long long value = strtoull(str, &end, 10);

	if (end != str && *end == '\0')
		return value;

	*err = str;
	return 0;
}
718 | ||
/* Stand-alone test driver:
 *   test_stripe save|restore|test file raid_disks chunk_size level layout
 *               start length devices...
 * 'save' copies stripes from the devices into 'file', 'restore' writes
 * them back from 'file', and 'test' checks the P and Q blocks on the
 * devices.
 * Exit status: 0 success, 1 usage error or operation failed,
 *              2 bad arguments, 3 open or allocation failure.
 */
int main(int argc, char *argv[])
{
	int save;
	int *fds;
	char *file;
	char *buf;
	int storefd;
	unsigned long long *offsets;
	int raid_disks, chunk_size, level, layout;
	unsigned long long start, length;
	int i;

	char *err = NULL;
	if (argc < 10) {
		fprintf(stderr, "Usage: test_stripe save/restore/test file raid_disks"
			" chunk_size level layout start length devices...\n");
		exit(1);
	}
	if (strcmp(argv[1], "save") == 0)
		save = 1;
	else if (strcmp(argv[1], "restore") == 0)
		save = 0;
	else if (strcmp(argv[1], "test") == 0)
		save = 2;
	else {
		fprintf(stderr, "test_stripe: must give 'save', 'restore' or 'test'.\n");
		exit(2);
	}

	file = argv[2];
	/* getnum() only ever sets 'err', so one check covers all six values */
	raid_disks = getnum(argv[3], &err);
	chunk_size = getnum(argv[4], &err);
	level = getnum(argv[5], &err);
	layout = getnum(argv[6], &err);
	start = getnum(argv[7], &err);
	length = getnum(argv[8], &err);
	if (err) {
		fprintf(stderr, "test_stripe: Bad number: %s\n", err);
		exit(2);
	}
	if (argc != raid_disks + 9) {
		fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
			raid_disks, argc-9);
		exit(2);
	}
	fds = malloc(raid_disks * sizeof(*fds));
	offsets = malloc(raid_disks * sizeof(*offsets));
	if (fds == NULL || offsets == NULL) {
		fprintf(stderr, "test_stripe: out of memory.\n");
		exit(3);
	}
	/* every device is used from its very beginning */
	memset(offsets, 0, raid_disks * sizeof(*offsets));

	storefd = open(file, O_RDWR);
	if (storefd < 0) {
		perror(file);
		fprintf(stderr, "test_stripe: could not open %s.\n", file);
		exit(3);
	}
	for (i = 0; i < raid_disks; i++) {
		fds[i] = open(argv[9+i], O_RDWR);
		if (fds[i] < 0) {
			perror(argv[9+i]);
			fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
			exit(3);
		}
	}

	/* scratch space for one whole stripe, used by save_stripes() */
	buf = malloc(raid_disks * chunk_size);
	if (buf == NULL) {
		fprintf(stderr, "test_stripe: out of memory.\n");
		exit(3);
	}

	if (save == 1) {
		int rv = save_stripes(fds, offsets,
				      raid_disks, chunk_size, level, layout,
				      1, &storefd,
				      start, length, buf);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: save_stripes returned %d\n", rv);
			exit(1);
		}
	} else if (save == 2) {
		int rv = test_stripes(fds, offsets,
				      raid_disks, chunk_size, level, layout,
				      start, length);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: test_stripes returned %d\n", rv);
			exit(1);
		}
	} else {
		int rv = restore_stripes(fds, offsets,
					 raid_disks, chunk_size, level, layout,
					 storefd, 0ULL,
					 start, length);
		if (rv != 0) {
			fprintf(stderr,
				"test_stripe: restore_stripes returned %d\n",
				rv);
			exit(1);
		}
	}
	exit(0);
}
820 | ||
821 | #endif /* MAIN */ |