--- /dev/null
+From d0dabf7e577411c2bf6b616c751544dc241213d4 Mon Sep 17 00:00:00 2001
+From: NeilBrown <neilb@suse.de>
+Date: Tue, 31 Mar 2009 14:39:38 +1100
+Subject: [PATCH] md/raid6: remove expectation that Q device is immediately after P device.
+
+Code currently assumes that the devices in a raid6 stripe are
+ 0 1 ... N-1 P Q
+in some rotated order. We will shortly add new layouts in which
+this strict pattern is broken.
+So remove this expectation. We still assume that the data disks
+are roughly in-order. However P and Q can be inserted anywhere within
+that order.
+
+Signed-off-by: NeilBrown <neilb@suse.de>
+---
+ drivers/md/raid5.c | 213 ++++++++++++++++++++++++++-------------------
+ include/linux/raid/raid5.h | 15 +--
+ 2 files changed, 133 insertions(+), 95 deletions(-)
+
+--- linux-2.6.27-SLE11_BRANCH.orig/drivers/md/raid5.c
++++ linux-2.6.27-SLE11_BRANCH/drivers/md/raid5.c
+@@ -135,12 +135,36 @@ static inline void raid5_set_bi_hw_segme
+ bio->bi_phys_segments = raid5_bi_phys_segments(bio) || (cnt << 16);
+ }
+
++/* Find first data disk in a raid6 stripe */
++static inline int raid6_d0(struct stripe_head *sh)
++{
++ if (sh->qd_idx == sh->disks - 1)
++ return 0;
++ else
++ return sh->qd_idx + 1;
++}
+ static inline int raid6_next_disk(int disk, int raid_disks)
+ {
+ disk++;
+ return (disk < raid_disks) ? disk : 0;
+ }
+
++/* When walking through the disks in a raid6 stripe, starting at raid6_d0,
++ * we need to map each disk to a 'slot', where the data disks are slot
++ * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
++ * is raid_disks-1. This helper does that mapping.
++ */
++static int raid6_idx_to_slot(int idx, struct stripe_head *sh, int *count)
++{
++ int slot;
++ if (idx == sh->pd_idx)
++ return sh->disks - 2;
++ if (idx == sh->qd_idx)
++ return sh->disks - 1;
++ slot = (*count)++;
++ return slot;
++}
++
+ static void return_io(struct bio *return_bi)
+ {
+ struct bio *bi = return_bi;
+@@ -198,6 +222,7 @@ static void __release_stripe(raid5_conf_
+ }
+ }
+ }
++
+ static void release_stripe(struct stripe_head *sh)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+@@ -276,12 +301,14 @@ static int grow_buffers(struct stripe_he
+ }
+
+ static void raid5_build_block (struct stripe_head *sh, int i);
+-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous);
++static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous,
++ int *qd_idx);
+
+ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
+ {
+ raid5_conf_t *conf = sh->raid_conf;
+ int i;
++ int qd_idx;
+
+ BUG_ON(atomic_read(&sh->count) != 0);
+ BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
+@@ -295,7 +322,8 @@ static void init_stripe(struct stripe_he
+
+ sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
+ sh->sector = sector;
+- sh->pd_idx = stripe_to_pdidx(sector, conf, previous);
++ sh->pd_idx = stripe_to_pdidx(sector, conf, previous, &qd_idx);
++ sh->qd_idx = qd_idx;
+ sh->state = 0;
+
+
+@@ -1237,7 +1265,7 @@ static void error(mddev_t *mddev, mdk_rd
+ */
+ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
+ int previous,
+- int *dd_idx, int *pd_idx)
++ int *dd_idx, int *pd_idx, int *qd_idx)
+ {
+ long stripe;
+ unsigned long chunk_number;
+@@ -1270,6 +1298,7 @@ static sector_t raid5_compute_sector(rai
+ /*
+ * Select the parity disk based on the user selected algorithm.
+ */
++ *qd_idx = ~0;
+ switch(conf->level) {
+ case 4:
+ *pd_idx = data_disks;
+@@ -1305,24 +1334,30 @@ static sector_t raid5_compute_sector(rai
+ switch (conf->algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ *pd_idx = raid_disks - 1 - (stripe % raid_disks);
+- if (*pd_idx == raid_disks-1)
++ *qd_idx = *pd_idx + 1;
++ if (*pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+- else if (*dd_idx >= *pd_idx)
++ *qd_idx = 0;
++ } else if (*dd_idx >= *pd_idx)
+ (*dd_idx) += 2; /* D D P Q D */
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ *pd_idx = stripe % raid_disks;
+- if (*pd_idx == raid_disks-1)
++ *qd_idx = *pd_idx + 1;
++ if (*pd_idx == raid_disks-1) {
+ (*dd_idx)++; /* Q D D D P */
+- else if (*dd_idx >= *pd_idx)
++ *qd_idx = 0;
++ } else if (*dd_idx >= *pd_idx)
+ (*dd_idx) += 2; /* D D P Q D */
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ *pd_idx = raid_disks - 1 - (stripe % raid_disks);
++ *qd_idx = (*pd_idx + 1) % raid_disks;
+ *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ *pd_idx = stripe % raid_disks;
++ *qd_idx = (*pd_idx + 1) % raid_disks;
+ *dd_idx = (*pd_idx + 2 + *dd_idx) % raid_disks;
+ break;
+ default:
+@@ -1349,7 +1384,7 @@ static sector_t compute_blocknr(struct s
+ int sectors_per_chunk = conf->chunk_size >> 9;
+ sector_t stripe;
+ int chunk_offset;
+- int chunk_number, dummy1, dummy2, dd_idx = i;
++ int chunk_number, dummy1, dummy2, dummy3, dd_idx = i;
+ sector_t r_sector;
+
+
+@@ -1380,7 +1415,7 @@ static sector_t compute_blocknr(struct s
+ }
+ break;
+ case 6:
+- if (i == raid6_next_disk(sh->pd_idx, raid_disks))
++ if (i == sh->qd_idx)
+ return 0; /* It is the Q disk */
+ switch (conf->algorithm) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+@@ -1413,7 +1448,7 @@ static sector_t compute_blocknr(struct s
+
+ check = raid5_compute_sector (conf, r_sector,
+ (raid_disks != conf->raid_disks),
+- &dummy1, &dummy2);
++ &dummy1, &dummy2, &dummy3);
+ if (check != sh->sector || dummy1 != dd_idx || dummy2 != sh->pd_idx) {
+ printk(KERN_ERR "compute_blocknr: map not correct\n");
+ return 0;
+@@ -1482,13 +1517,14 @@ static void copy_data(int frombio, struc
+ static void compute_parity6(struct stripe_head *sh, int method)
+ {
+ raid6_conf_t *conf = sh->raid_conf;
+- int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
++ int i, pd_idx, qd_idx, d0_idx, disks = sh->disks, count;
+ struct bio *chosen;
+ /**** FIX THIS: This could be very bad if disks is close to 256 ****/
+ void *ptrs[disks];
+
+- qd_idx = raid6_next_disk(pd_idx, disks);
+- d0_idx = raid6_next_disk(qd_idx, disks);
++ pd_idx = sh->pd_idx;
++ qd_idx = sh->qd_idx;
++ d0_idx = raid6_d0(sh);
+
+ pr_debug("compute_parity, stripe %llu, method %d\n",
+ (unsigned long long)sh->sector, method);
+@@ -1526,22 +1562,22 @@ static void compute_parity6(struct strip
+ set_bit(R5_UPTODATE, &sh->dev[i].flags);
+ }
+
+-// switch(method) {
+-// case RECONSTRUCT_WRITE:
+-// case CHECK_PARITY:
+-// case UPDATE_PARITY:
+- /* Note that unlike RAID-5, the ordering of the disks matters greatly. */
+- /* FIX: Is this ordering of drives even remotely optimal? */
+- count = 0;
+- i = d0_idx;
+- do {
+- ptrs[count++] = page_address(sh->dev[i].page);
+- if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
+- printk("block %d/%d not uptodate on parity calc\n", i,count);
+- i = raid6_next_disk(i, disks);
+- } while ( i != d0_idx );
+-// break;
+-// }
++ /* Note that unlike RAID-5, the ordering of the disks matters greatly.*/
++ /* FIX: Is this ordering of drives even remotely optimal? */
++ count = 0;
++ i = d0_idx;
++ do {
++ int slot = raid6_idx_to_slot(i, sh, &count);
++ ptrs[slot] = page_address(sh->dev[i].page);
++ if (slot < sh->disks - 2 &&
++ !test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
++ printk(KERN_ERR "block %d/%d not uptodate "
++ "on parity calc\n", i, count);
++ BUG();
++ }
++ i = raid6_next_disk(i, disks);
++ } while (i != d0_idx);
++ BUG_ON(count+2 != disks);
+
+ raid6_call.gen_syndrome(disks, STRIPE_SIZE, ptrs);
+
+@@ -1565,8 +1601,7 @@ static void compute_block_1(struct strip
+ {
+ int i, count, disks = sh->disks;
+ void *ptr[MAX_XOR_BLOCKS], *dest, *p;
+- int pd_idx = sh->pd_idx;
+- int qd_idx = raid6_next_disk(pd_idx, disks);
++ int qd_idx = sh->qd_idx;
+
+ pr_debug("compute_block_1, stripe %llu, idx %d\n",
+ (unsigned long long)sh->sector, dd_idx);
+@@ -1602,21 +1637,31 @@ static void compute_block_1(struct strip
+ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2)
+ {
+ int i, count, disks = sh->disks;
+- int pd_idx = sh->pd_idx;
+- int qd_idx = raid6_next_disk(pd_idx, disks);
+- int d0_idx = raid6_next_disk(qd_idx, disks);
+- int faila, failb;
+-
+- /* faila and failb are disk numbers relative to d0_idx */
+- /* pd_idx become disks-2 and qd_idx become disks-1 */
+- faila = (dd_idx1 < d0_idx) ? dd_idx1+(disks-d0_idx) : dd_idx1-d0_idx;
+- failb = (dd_idx2 < d0_idx) ? dd_idx2+(disks-d0_idx) : dd_idx2-d0_idx;
++ int d0_idx = raid6_d0(sh);
++ int faila = -1, failb = -1;
++ /**** FIX THIS: This could be very bad if disks is close to 256 ****/
++ void *ptrs[disks];
++
++ count = 0;
++ i = d0_idx;
++ do {
++ int slot;
++ slot = raid6_idx_to_slot(i, sh, &count);
++ ptrs[slot] = page_address(sh->dev[i].page);
++ if (i == dd_idx1)
++ faila = slot;
++ if (i == dd_idx2)
++ failb = slot;
++ i = raid6_next_disk(i, disks);
++ } while (i != d0_idx);
++ BUG_ON(count+2 != disks);
+
+ BUG_ON(faila == failb);
+ if ( failb < faila ) { int tmp = faila; faila = failb; failb = tmp; }
+
+ pr_debug("compute_block_2, stripe %llu, idx %d,%d (%d,%d)\n",
+- (unsigned long long)sh->sector, dd_idx1, dd_idx2, faila, failb);
++ (unsigned long long)sh->sector, dd_idx1, dd_idx2,
++ faila, failb);
+
+ if ( failb == disks-1 ) {
+ /* Q disk is one of the missing disks */
+@@ -1626,39 +1671,26 @@ static void compute_block_2(struct strip
+ return;
+ } else {
+ /* We're missing D+Q; recompute D from P */
+- compute_block_1(sh, (dd_idx1 == qd_idx) ? dd_idx2 : dd_idx1, 0);
++ compute_block_1(sh, ((dd_idx1 == sh->qd_idx) ?
++ dd_idx2 : dd_idx1),
++ 0);
+ compute_parity6(sh, UPDATE_PARITY); /* Is this necessary? */
+ return;
+ }
+ }
+
+- /* We're missing D+P or D+D; build pointer table */
+- {
+- /**** FIX THIS: This could be very bad if disks is close to 256 ****/
+- void *ptrs[disks];
+-
+- count = 0;
+- i = d0_idx;
+- do {
+- ptrs[count++] = page_address(sh->dev[i].page);
+- i = raid6_next_disk(i, disks);
+- if (i != dd_idx1 && i != dd_idx2 &&
+- !test_bit(R5_UPTODATE, &sh->dev[i].flags))
+- printk("compute_2 with missing block %d/%d\n", count, i);
+- } while ( i != d0_idx );
+-
+- if ( failb == disks-2 ) {
+- /* We're missing D+P. */
+- raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
+- } else {
+- /* We're missing D+D. */
+- raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
+- }
+-
+- /* Both the above update both missing blocks */
+- set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
+- set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
++ /* We're missing D+P or D+D; */
++ if (failb == disks-2) {
++ /* We're missing D+P. */
++ raid6_datap_recov(disks, STRIPE_SIZE, faila, ptrs);
++ } else {
++ /* We're missing D+D. */
++ raid6_2data_recov(disks, STRIPE_SIZE, faila, failb, ptrs);
+ }
++
++ /* Both the above update both missing blocks */
++ set_bit(R5_UPTODATE, &sh->dev[dd_idx1].flags);
++ set_bit(R5_UPTODATE, &sh->dev[dd_idx2].flags);
+ }
+
+ static void
+@@ -1813,7 +1845,8 @@ static int page_is_zero(struct page *p)
+ memcmp(a, a+4, STRIPE_SIZE-4)==0);
+ }
+
+-static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous)
++static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int previous,
++ int *qd_idxp)
+ {
+ int sectors_per_chunk = conf->chunk_size >> 9;
+ int pd_idx, dd_idx;
+@@ -1824,7 +1857,7 @@ static int stripe_to_pdidx(sector_t stri
+ stripe * (disks - conf->max_degraded)
+ *sectors_per_chunk + chunk_offset,
+ previous,
+- &dd_idx, &pd_idx);
++ &dd_idx, &pd_idx, qd_idxp);
+ return pd_idx;
+ }
+
+@@ -2483,12 +2516,13 @@ static void handle_stripe_expansion(raid
+ clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+ for (i = 0; i < sh->disks; i++)
+ if (i != sh->pd_idx && (!r6s || i != r6s->qd_idx)) {
+- int dd_idx, pd_idx, j;
++ int dd_idx, pd_idx, qd_idx, j;
+ struct stripe_head *sh2;
+
+ sector_t bn = compute_blocknr(sh, i);
+- sector_t s = raid5_compute_sector(conf, bn, 0,
+- &dd_idx, &pd_idx);
++ sector_t s =
++ raid5_compute_sector(conf, bn, 0,
++ &dd_idx, &pd_idx, &qd_idx);
+ sh2 = get_active_stripe(conf, s, 0, 1);
+ if (sh2 == NULL)
+ /* so far only the early blocks of this stripe
+@@ -2512,8 +2546,7 @@ static void handle_stripe_expansion(raid
+ set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
+ for (j = 0; j < conf->raid_disks; j++)
+ if (j != sh2->pd_idx &&
+- (!r6s || j != raid6_next_disk(sh2->pd_idx,
+- sh2->disks)) &&
++ (!r6s || j != sh2->qd_idx) &&
+ !test_bit(R5_Expanded, &sh2->dev[j].flags))
+ break;
+ if (j == conf->raid_disks) {
+@@ -2773,9 +2806,11 @@ static bool handle_stripe5(struct stripe
+
+ if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+ !sh->reconstruct_state) {
++ int qd_idx;
+ /* Need to write out all blocks after computing parity */
+ sh->disks = conf->raid_disks;
+- sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0);
++ sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx);
++ sh->qd_idx = qd_idx;
+ schedule_reconstruction5(sh, &s, 1, 1);
+ } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
+ clear_bit(STRIPE_EXPAND_READY, &sh->state);
+@@ -2816,7 +2851,7 @@ static bool handle_stripe6(struct stripe
+ struct r5dev *dev, *pdev, *qdev;
+ mdk_rdev_t *blocked_rdev = NULL;
+
+- r6s.qd_idx = raid6_next_disk(pd_idx, disks);
++ r6s.qd_idx = sh->qd_idx;
+ pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
+ "pd_idx=%d, qd_idx=%d\n",
+ (unsigned long long)sh->sector, sh->state,
+@@ -2992,8 +3027,10 @@ static bool handle_stripe6(struct stripe
+
+ if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+ /* Need to write out all blocks after computing P&Q */
++ int qd_idx;
+ sh->disks = conf->raid_disks;
+- sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0);
++ sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 0, &qd_idx);
++ sh->qd_idx = qd_idx;
+ compute_parity6(sh, RECONSTRUCT_WRITE);
+ for (i = conf->raid_disks ; i-- ; ) {
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+@@ -3265,7 +3302,7 @@ static int chunk_aligned_read(struct req
+ {
+ mddev_t *mddev = q->queuedata;
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+- unsigned int dd_idx, pd_idx;
++ unsigned int dd_idx, pd_idx, qd_idx;
+ struct bio* align_bi;
+ mdk_rdev_t *rdev;
+
+@@ -3290,7 +3327,7 @@ static int chunk_aligned_read(struct req
+ */
+ align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
+ 0,
+- &dd_idx, &pd_idx);
++ &dd_idx, &pd_idx, &qd_idx);
+
+ rcu_read_lock();
+ rdev = rcu_dereference(conf->disks[dd_idx].rdev);
+@@ -3382,7 +3419,7 @@ static int make_request(struct request_q
+ {
+ mddev_t *mddev = q->queuedata;
+ raid5_conf_t *conf = mddev_to_conf(mddev);
+- unsigned int dd_idx, pd_idx;
++ int dd_idx, pd_idx, qd_idx;
+ sector_t new_sector;
+ sector_t logical_sector, last_sector;
+ struct stripe_head *sh;
+@@ -3446,7 +3483,7 @@ static int make_request(struct request_q
+
+ new_sector = raid5_compute_sector(conf, logical_sector,
+ previous,
+- &dd_idx, &pd_idx);
++ &dd_idx, &pd_idx, &qd_idx);
+ pr_debug("raid5: make_request, sector %llu logical %llu\n",
+ (unsigned long long)new_sector,
+ (unsigned long long)logical_sector);
+@@ -3535,7 +3572,7 @@ static sector_t reshape_request(mddev_t
+ */
+ raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
+ struct stripe_head *sh;
+- int pd_idx;
++ int pd_idx, qd_idx;
+ sector_t first_sector, last_sector;
+ int raid_disks = conf->previous_raid_disks;
+ int data_disks = raid_disks - conf->max_degraded;
+@@ -3598,7 +3635,7 @@ static sector_t reshape_request(mddev_t
+ if (j == sh->pd_idx)
+ continue;
+ if (conf->level == 6 &&
+- j == raid6_next_disk(sh->pd_idx, sh->disks))
++ j == sh->qd_idx)
+ continue;
+ s = compute_blocknr(sh, j);
+ if (s < mddev->array_sectors) {
+@@ -3625,11 +3662,11 @@ static sector_t reshape_request(mddev_t
+ */
+ first_sector =
+ raid5_compute_sector(conf, sector_nr*(new_data_disks),
+- 1, &dd_idx, &pd_idx);
++ 1, &dd_idx, &pd_idx, &qd_idx);
+ last_sector =
+ raid5_compute_sector(conf, ((sector_nr+conf->chunk_size/512)
+ *(new_data_disks) - 1),
+- 1, &dd_idx, &pd_idx);
++ 1, &dd_idx, &pd_idx, &qd_idx);
+ if (last_sector >= (mddev->size<<1))
+ last_sector = (mddev->size<<1)-1;
+ while (first_sector <= last_sector) {
+@@ -3764,7 +3801,7 @@ static int retry_aligned_read(raid5_con
+ * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
+ */
+ struct stripe_head *sh;
+- int dd_idx, pd_idx;
++ int dd_idx, pd_idx, qd_idx;
+ sector_t sector, logical_sector, last_sector;
+ int scnt = 0;
+ int remaining;
+@@ -3772,7 +3809,7 @@ static int retry_aligned_read(raid5_con
+
+ logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+ sector = raid5_compute_sector(conf, logical_sector,
+- 0, &dd_idx, &pd_idx);
++ 0, &dd_idx, &pd_idx, &qd_idx);
+ last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+
+ for (; logical_sector < last_sector;
+--- linux-2.6.27-SLE11_BRANCH.orig/include/linux/raid/raid5.h
++++ linux-2.6.27-SLE11_BRANCH/include/linux/raid/raid5.h
+@@ -197,15 +197,16 @@ enum reconstruct_states {
+
+ struct stripe_head {
+ struct hlist_node hash;
+- struct list_head lru; /* inactive_list or handle_list */
+- struct raid5_private_data *raid_conf;
+- sector_t sector; /* sector of this row */
+- int pd_idx; /* parity disk index */
+- unsigned long state; /* state flags */
+- atomic_t count; /* nr of active thread/requests */
++ struct list_head lru; /* inactive_list or handle_list */
++ struct raid5_private_data *raid_conf;
++ sector_t sector; /* sector of this row */
++ short pd_idx; /* parity disk index */
++ short qd_idx; /* 'Q' disk index for raid6 */
++ unsigned long state; /* state flags */
++ atomic_t count; /* nr of active thread/requests */
+ spinlock_t lock;
+ int bm_seq; /* sequence number for bitmap flushes */
+- int disks; /* disks in stripe */
++ int disks; /* disks in stripe */
+ enum check_states check_state;
+ enum reconstruct_states reconstruct_state;
+ /* stripe_operations