From c49a99844848aa3580b6f002a086c637aac8c0c6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 21 Jul 2023 09:05:48 +0200 Subject: [PATCH] 4.19-stable patches added patches: fs-dlm-return-positive-pid-value-for-f_getlk.patch md-raid0-add-discard-support-for-the-original-layout.patch --- ...eturn-positive-pid-value-for-f_getlk.patch | 36 ++++ ...card-support-for-the-original-layout.patch | 203 ++++++++++++++++++ queue-4.19/series | 2 + 3 files changed, 241 insertions(+) create mode 100644 queue-4.19/fs-dlm-return-positive-pid-value-for-f_getlk.patch create mode 100644 queue-4.19/md-raid0-add-discard-support-for-the-original-layout.patch diff --git a/queue-4.19/fs-dlm-return-positive-pid-value-for-f_getlk.patch b/queue-4.19/fs-dlm-return-positive-pid-value-for-f_getlk.patch new file mode 100644 index 00000000000..deaa99d2a2f --- /dev/null +++ b/queue-4.19/fs-dlm-return-positive-pid-value-for-f_getlk.patch @@ -0,0 +1,36 @@ +From 92655fbda5c05950a411eaabc19e025e86e2a291 Mon Sep 17 00:00:00 2001 +From: Alexander Aring +Date: Fri, 19 May 2023 11:21:24 -0400 +Subject: fs: dlm: return positive pid value for F_GETLK + +From: Alexander Aring + +commit 92655fbda5c05950a411eaabc19e025e86e2a291 upstream. + +The GETLK pid values have all been negated since commit 9d5b86ac13c5 +("fs/locks: Remove fl_nspid and use fs-specific l_pid for remote locks"). +Revert this for local pids, and leave in place negative pids for remote +owners. + +Cc: stable@vger.kernel.org +Fixes: 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific l_pid for remote locks") +Signed-off-by: Alexander Aring +Signed-off-by: David Teigland +Signed-off-by: Greg Kroah-Hartman +--- + fs/dlm/plock.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/dlm/plock.c ++++ b/fs/dlm/plock.c +@@ -366,7 +366,9 @@ int dlm_posix_get(dlm_lockspace_t *locks + locks_init_lock(fl); + fl->fl_type = (op->info.ex) ? 
F_WRLCK : F_RDLCK; + fl->fl_flags = FL_POSIX; +- fl->fl_pid = -op->info.pid; ++ fl->fl_pid = op->info.pid; ++ if (op->info.nodeid != dlm_our_nodeid()) ++ fl->fl_pid = -fl->fl_pid; + fl->fl_start = op->info.start; + fl->fl_end = op->info.end; + rv = 0; diff --git a/queue-4.19/md-raid0-add-discard-support-for-the-original-layout.patch b/queue-4.19/md-raid0-add-discard-support-for-the-original-layout.patch new file mode 100644 index 00000000000..8f1c4a01d52 --- /dev/null +++ b/queue-4.19/md-raid0-add-discard-support-for-the-original-layout.patch @@ -0,0 +1,203 @@ +From e836007089ba8fdf24e636ef2b007651fb4582e6 Mon Sep 17 00:00:00 2001 +From: Jason Baron +Date: Fri, 23 Jun 2023 14:05:23 -0400 +Subject: md/raid0: add discard support for the 'original' layout + +From: Jason Baron + +commit e836007089ba8fdf24e636ef2b007651fb4582e6 upstream. + +We've found that using raid0 with the 'original' layout and discard +enabled with different disk sizes (such that at least two zones are +created) can result in data corruption. This is due to the fact that +the discard handling in 'raid0_handle_discard()' assumes the 'alternate' +layout. We've seen this corruption using ext4 but other filesystems are +likely susceptible as well. + +More specifically, while multiple zones are necessary to create the +corruption, the corruption may not occur with multiple zones if they +layout in such a way the layout matches what the 'alternate' layout +would have produced. Thus, not all raid0 devices with the 'original' +layout, different size disks and discard enabled will encounter this +corruption. + +The 3.14 kernel inadvertently changed the raid0 disk layout for different +size disks. Thus, running a pre-3.14 kernel and post-3.14 kernel on the +same raid0 array could corrupt data. 
This led to the creation of the +'original' layout (to match the pre-3.14 layout) and the 'alternate' layout +(to match the post-3.14 layout) in the 5.4 kernel time frame and an option +to tell the kernel which layout to use (since it couldn't be autodetected). +However, when the 'original' layout was added back in 5.4, discard support +for the 'original' layout was not added, leading to this issue. + +I've been able to reliably reproduce the corruption with the following +test case: + +1. create raid0 array with different size disks using original layout +2. mkfs +3. mount -o discard +4. create lots of files +5. remove 1/2 the files +6. fstrim -a (or just the mount point for the raid0 array) +7. umount +8. fsck -fn /dev/md0 (spews all sorts of corruptions) + +Let's fix this by adding proper discard support to the 'original' layout. +The fix 'maps' the 'original' layout disks to the order in which they are +read/written such that we can compare the disks in the same way that the +current 'alternate' layout does. A 'disk_shift' field is added to +'struct strip_zone'. This could be computed on the fly in +raid0_handle_discard() but by adding this field, we save some computation +in the discard path. + +Note we could also potentially fix this by re-ordering the disks in the +zones that follow the first one, and then always reading/writing them using +the 'alternate' layout. However, that is seen as a more substantial change, +and we are attempting the least invasive fix at this time to remedy the +corruption. + +I've verified the change using the reproducer mentioned above. Typically, +the corruption is seen after less than 3 iterations, while the patch has +run 500+ iterations. 
+ +Cc: NeilBrown +Cc: Song Liu +Fixes: c84a1372df92 ("md/raid0: avoid RAID0 data corruption due to layout confusion.") +Cc: stable@vger.kernel.org +Signed-off-by: Jason Baron +Signed-off-by: Song Liu +Link: https://lore.kernel.org/r/20230623180523.1901230-1-jbaron@akamai.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/raid0.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++------- + drivers/md/raid0.h | 1 + 2 files changed, 55 insertions(+), 8 deletions(-) + +--- a/drivers/md/raid0.c ++++ b/drivers/md/raid0.c +@@ -296,6 +296,18 @@ static int create_strip_zones(struct mdd + goto abort; + } + ++ if (conf->layout == RAID0_ORIG_LAYOUT) { ++ for (i = 1; i < conf->nr_strip_zones; i++) { ++ sector_t first_sector = conf->strip_zone[i-1].zone_end; ++ ++ sector_div(first_sector, mddev->chunk_sectors); ++ zone = conf->strip_zone + i; ++ /* disk_shift is first disk index used in the zone */ ++ zone->disk_shift = sector_div(first_sector, ++ zone->nb_dev); ++ } ++ } ++ + pr_debug("md/raid0:%s: done.\n", mdname(mddev)); + *private_conf = conf; + +@@ -482,6 +494,20 @@ static inline int is_io_in_chunk_boundar + } + } + ++/* ++ * Convert disk_index to the disk order in which it is read/written. ++ * For example, if we have 4 disks, they are numbered 0,1,2,3. If we ++ * write the disks starting at disk 3, then the read/write order would ++ * be disk 3, then 0, then 1, and then disk 2 and we want map_disk_shift() ++ * to map the disks as follows 0,1,2,3 => 1,2,3,0. So disk 0 would map ++ * to 1, 1 to 2, 2 to 3, and 3 to 0. That way we can compare disks in ++ * that 'output' space to understand the read/write disk ordering. 
++ */ ++static int map_disk_shift(int disk_index, int num_disks, int disk_shift) ++{ ++ return ((disk_index + num_disks - disk_shift) % num_disks); ++} ++ + static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) + { + struct r0conf *conf = mddev->private; +@@ -495,7 +521,9 @@ static void raid0_handle_discard(struct + sector_t end_disk_offset; + unsigned int end_disk_index; + unsigned int disk; ++ sector_t orig_start, orig_end; + ++ orig_start = start; + zone = find_zone(conf, &start); + + if (bio_end_sector(bio) > zone->zone_end) { +@@ -509,6 +537,7 @@ static void raid0_handle_discard(struct + } else + end = bio_end_sector(bio); + ++ orig_end = end; + if (zone != conf->strip_zone) + end = end - zone[-1].zone_end; + +@@ -520,13 +549,26 @@ static void raid0_handle_discard(struct + last_stripe_index = end; + sector_div(last_stripe_index, stripe_size); + +- start_disk_index = (int)(start - first_stripe_index * stripe_size) / +- mddev->chunk_sectors; ++ /* In the first zone the original and alternate layouts are the same */ ++ if ((conf->layout == RAID0_ORIG_LAYOUT) && (zone != conf->strip_zone)) { ++ sector_div(orig_start, mddev->chunk_sectors); ++ start_disk_index = sector_div(orig_start, zone->nb_dev); ++ start_disk_index = map_disk_shift(start_disk_index, ++ zone->nb_dev, ++ zone->disk_shift); ++ sector_div(orig_end, mddev->chunk_sectors); ++ end_disk_index = sector_div(orig_end, zone->nb_dev); ++ end_disk_index = map_disk_shift(end_disk_index, ++ zone->nb_dev, zone->disk_shift); ++ } else { ++ start_disk_index = (int)(start - first_stripe_index * stripe_size) / ++ mddev->chunk_sectors; ++ end_disk_index = (int)(end - last_stripe_index * stripe_size) / ++ mddev->chunk_sectors; ++ } + start_disk_offset = ((int)(start - first_stripe_index * stripe_size) % + mddev->chunk_sectors) + + first_stripe_index * mddev->chunk_sectors; +- end_disk_index = (int)(end - last_stripe_index * stripe_size) / +- mddev->chunk_sectors; + end_disk_offset = ((int)(end - 
last_stripe_index * stripe_size) % + mddev->chunk_sectors) + + last_stripe_index * mddev->chunk_sectors; +@@ -535,18 +577,22 @@ static void raid0_handle_discard(struct + sector_t dev_start, dev_end; + struct bio *discard_bio = NULL; + struct md_rdev *rdev; ++ int compare_disk; ++ ++ compare_disk = map_disk_shift(disk, zone->nb_dev, ++ zone->disk_shift); + +- if (disk < start_disk_index) ++ if (compare_disk < start_disk_index) + dev_start = (first_stripe_index + 1) * + mddev->chunk_sectors; +- else if (disk > start_disk_index) ++ else if (compare_disk > start_disk_index) + dev_start = first_stripe_index * mddev->chunk_sectors; + else + dev_start = start_disk_offset; + +- if (disk < end_disk_index) ++ if (compare_disk < end_disk_index) + dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; +- else if (disk > end_disk_index) ++ else if (compare_disk > end_disk_index) + dev_end = last_stripe_index * mddev->chunk_sectors; + else + dev_end = end_disk_offset; +--- a/drivers/md/raid0.h ++++ b/drivers/md/raid0.h +@@ -6,6 +6,7 @@ struct strip_zone { + sector_t zone_end; /* Start of the next zone (in sectors) */ + sector_t dev_start; /* Zone offset in real dev (in sectors) */ + int nb_dev; /* # of devices attached to the zone */ ++ int disk_shift; /* start disk for the original layout */ + }; + + /* Linux 3.14 (20d0189b101) made an unintended change to diff --git a/queue-4.19/series b/queue-4.19/series index 8800ae76fbc..97321b5ef24 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -170,3 +170,5 @@ pci-rockchip-fix-legacy-irq-generation-for-rk3399-pcie-endpoint-core.patch pci-rockchip-use-u32-variable-to-access-32-bit-registers.patch misc-pci_endpoint_test-free-irqs-before-removing-the-device.patch misc-pci_endpoint_test-re-init-completion-for-every-test.patch +md-raid0-add-discard-support-for-the-original-layout.patch +fs-dlm-return-positive-pid-value-for-f_getlk.patch -- 2.47.3