From 9127c5459f9600f66efe8b10349bcfe3c0ca3eb8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 1 Oct 2022 18:03:41 -0400 Subject: [PATCH] Fixes for 4.9 Signed-off-by: Sasha Levin --- ...umber-of-retries-after-discarding-pr.patch | 80 ++++++++ ...roduce-pcpu-seqcnt-for-freeing-pa-to.patch | 194 ++++++++++++++++++ ...actor-ext4_mb_discard_preallocations.patch | 68 ++++++ queue-4.9/series | 4 + ...ption-improve-quectel-ep06-detection.patch | 86 ++++++++ 5 files changed, 432 insertions(+) create mode 100644 queue-4.9/ext4-limit-the-number-of-retries-after-discarding-pr.patch create mode 100644 queue-4.9/ext4-mballoc-introduce-pcpu-seqcnt-for-freeing-pa-to.patch create mode 100644 queue-4.9/ext4-mballoc-refactor-ext4_mb_discard_preallocations.patch create mode 100644 queue-4.9/series create mode 100644 queue-4.9/usb-serial-option-improve-quectel-ep06-detection.patch diff --git a/queue-4.9/ext4-limit-the-number-of-retries-after-discarding-pr.patch b/queue-4.9/ext4-limit-the-number-of-retries-after-discarding-pr.patch new file mode 100644 index 00000000000..8b241607941 --- /dev/null +++ b/queue-4.9/ext4-limit-the-number-of-retries-after-discarding-pr.patch @@ -0,0 +1,80 @@ +From f77dd733cbe77b373086ddff6d8d07c727cc1fbf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 1 Sep 2022 18:03:14 -0400 +Subject: ext4: limit the number of retries after discarding preallocations + blocks + +From: Theodore Ts'o + +[ Upstream commit 80fa46d6b9e7b1527bfd2197d75431fd9c382161 ] + +This patch avoids threads live-locking for hours when a large number of +threads are competing over the last few free extents as the blocks +are getting added and removed from preallocation pools. From our bug +reporter: + + A reliable way for triggering this has multiple writers + continuously write() to files when the filesystem is full, while + small amounts of space are freed (e.g. by truncating a large file + -1MiB at a time).
In the local filesystem, this can be done by + simply not checking the return code of write (0) and/or the error + (ENOSPACE) that is set. Over NFS with an async mount, even clients + with proper error checking will behave this way since the linux NFS + client implementation will not propagate the server errors [the + write syscalls immediately return success] until the file handle is + closed. This leads to a situation where NFS clients send a + continuous stream of WRITE rpcs which result in ERRNOSPACE -- but + since the client isn't seeing this, the stream of writes continues + at maximum network speed. + + When some space does appear, multiple writers will all attempt to + claim it for their current write. For NFS, we may see dozens to + hundreds of threads that do this. + + The real-world scenario of this is database backup tooling (in + particular, github.com/mdkent/percona-xtrabackup) which may write + large files (>1TiB) to NFS for safe keeping. Some temporary files + are written, rewound, and read back -- all before closing the file + handle (the temp file is actually unlinked, to trigger automatic + deletion on close/crash.) An application like this operating on an + async NFS mount will not see an error code until TiB have been + written/read. + + The lockup was observed when running this database backup on large + filesystems (64 TiB in this case) with a high number of block + groups and no free space. Fragmentation is generally not a factor + in this filesystem (~thousands of large files, mostly contiguous + except for the parts written while the filesystem is at capacity.) 
+ +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +Signed-off-by: Sasha Levin +--- + fs/ext4/mballoc.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index d4c9b43adb0b..955b60e449d1 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -4534,6 +4534,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; ++ int retries = 0; + u64 seq; + + might_sleep(); +@@ -4628,7 +4629,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + ar->len = ac->ac_b_ex.fe_len; + } + } else { +- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) ++ if (++retries < 3 && ++ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + goto repeat; + *errp = -ENOSPC; + } +-- +2.35.1 + diff --git a/queue-4.9/ext4-mballoc-introduce-pcpu-seqcnt-for-freeing-pa-to.patch b/queue-4.9/ext4-mballoc-introduce-pcpu-seqcnt-for-freeing-pa-to.patch new file mode 100644 index 00000000000..ab81332658b --- /dev/null +++ b/queue-4.9/ext4-mballoc-introduce-pcpu-seqcnt-for-freeing-pa-to.patch @@ -0,0 +1,194 @@ +From 77dc6804f2242e5d54162c330392147da6def41c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 May 2020 12:10:34 +0530 +Subject: ext4: mballoc: introduce pcpu seqcnt for freeing PA to improve ENOSPC + handling + +From: Ritesh Harjani + +[ Upstream commit 07b5b8e1ac4004b7db1065a301df65cd434c31c9 ] + +There could be a race in function ext4_mb_discard_group_preallocations() +where the 1st thread may iterate through group's bb_prealloc_list and +remove all the PAs and add to function's local list head. +Now if the 2nd thread comes in to discard the group preallocations, +it will see that the group->bb_prealloc_list is empty and will return 0. + +Consider for a case where we have less number of groups +(for e.g. 
just group 0), +this may even return an -ENOSPC error from ext4_mb_new_blocks() +(where we call for ext4_mb_discard_group_preallocations()). +But that is wrong, since 2nd thread should have waited for 1st thread +to release all the PAs and should have retried for allocation. +Since 1st thread was anyway going to discard the PAs. + +The algorithm using this percpu seq counter goes below: +1. We sample the percpu discard_pa_seq counter before trying for block + allocation in ext4_mb_new_blocks(). +2. We increment this percpu discard_pa_seq counter when we either allocate + or free these blocks i.e. while marking those blocks as used/free in + mb_mark_used()/mb_free_blocks(). +3. We also increment this percpu seq counter when we successfully identify + that the bb_prealloc_list is not empty and hence proceed for discarding + of those PAs inside ext4_mb_discard_group_preallocations(). + +Now to make sure that the regular fast path of block allocation is not +affected, as a small optimization we only sample the percpu seq counter +on that cpu. Only when the block allocation fails and when freed blocks +found were 0, that is when we sample percpu seq counter for all cpus using +below function ext4_get_discard_pa_seq_sum(). This happens after making +sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. + +It can be well argued that why don't we just check for grp->bb_free to +see if there are any free blocks to be allocated. So here are the two +concerns which were discussed:- + +1. If for some reason the blocks available in the group are not + appropriate for allocation logic (say for e.g. + EXT4_MB_HINT_GOAL_ONLY, although this is not yet implemented), then + the retry logic may result in infinite looping since grp->bb_free is + non-zero. + +2. Also before preallocation was clubbed with block allocation with the + same ext4_lock_group() held, there were a lot of races where grp->bb_free + could not be reliably relied upon.
+Due to above, this patch considers discard_pa_seq logic to determine if +we should retry for block allocation. Say if there are n threads +trying for block allocation and none of those could allocate or discard +any of the blocks, then all of those n threads will fail the block +allocation and return -ENOSPC error. (Since the seq counter for all of +those will match as no block allocation/discard was done during that +duration). + +Signed-off-by: Ritesh Harjani +Link: https://lore.kernel.org/r/7f254686903b87c419d798742fd9a1be34f0657b.1589955723.git.riteshh@linux.ibm.com +Signed-off-by: Theodore Ts'o +Stable-dep-of: 80fa46d6b9e7 ("ext4: limit the number of retries after discarding preallocations blocks") +Signed-off-by: Sasha Levin +--- + fs/ext4/mballoc.c | 56 ++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 51 insertions(+), 5 deletions(-) + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index eb3eeae73faa..d4c9b43adb0b 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -371,6 +371,35 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + static void ext4_free_data_callback(struct super_block *sb, + struct ext4_journal_cb_entry *jce, int rc); + ++/* ++ * The algorithm using this percpu seq counter goes below: ++ * 1. We sample the percpu discard_pa_seq counter before trying for block ++ * allocation in ext4_mb_new_blocks(). ++ * 2. We increment this percpu discard_pa_seq counter when we either allocate ++ * or free these blocks i.e. while marking those blocks as used/free in ++ * mb_mark_used()/mb_free_blocks(). ++ * 3. We also increment this percpu seq counter when we successfully identify ++ * that the bb_prealloc_list is not empty and hence proceed for discarding ++ * of those PAs inside ext4_mb_discard_group_preallocations().
++ * ++ * Now to make sure that the regular fast path of block allocation is not ++ * affected, as a small optimization we only sample the percpu seq counter ++ * on that cpu. Only when the block allocation fails and when freed blocks ++ * found were 0, that is when we sample percpu seq counter for all cpus using ++ * below function ext4_get_discard_pa_seq_sum(). This happens after making ++ * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. ++ */ ++static DEFINE_PER_CPU(u64, discard_pa_seq); ++static inline u64 ext4_get_discard_pa_seq_sum(void) ++{ ++ int __cpu; ++ u64 __seq = 0; ++ ++ for_each_possible_cpu(__cpu) ++ __seq += per_cpu(discard_pa_seq, __cpu); ++ return __seq; ++} ++ + static inline void *mb_correct_addr_and_bit(int *bit, void *addr) + { + #if BITS_PER_LONG == 64 +@@ -1444,6 +1473,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, + mb_check_buddy(e4b); + mb_free_blocks_double(inode, e4b, first, count); + ++ this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free += count; + if (first < e4b->bd_info->bb_first_free) + e4b->bd_info->bb_first_free = first; +@@ -1579,6 +1609,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) + mb_check_buddy(e4b); + mb_mark_used_double(e4b, start, len); + ++ this_cpu_inc(discard_pa_seq); + e4b->bd_info->bb_free -= len; + if (e4b->bd_info->bb_first_free == start) + e4b->bd_info->bb_first_free += len; +@@ -3923,6 +3954,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, + INIT_LIST_HEAD(&list); + repeat: + ext4_lock_group(sb, group); ++ this_cpu_inc(discard_pa_seq); + list_for_each_entry_safe(pa, tmp, + &grp->bb_prealloc_list, pa_group_list) { + spin_lock(&pa->pa_lock); +@@ -4466,14 +4498,26 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) + } + + static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, +- struct ext4_allocation_context *ac) ++ struct ext4_allocation_context 
*ac, u64 *seq) + { + int freed; ++ u64 seq_retry = 0; ++ bool ret = false; + + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); +- if (freed) +- return true; +- return false; ++ if (freed) { ++ ret = true; ++ goto out_dbg; ++ } ++ seq_retry = ext4_get_discard_pa_seq_sum(); ++ if (seq_retry != *seq) { ++ *seq = seq_retry; ++ ret = true; ++ } ++ ++out_dbg: ++ mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); ++ return ret; + } + + /* +@@ -4490,6 +4534,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + ext4_fsblk_t block = 0; + unsigned int inquota = 0; + unsigned int reserv_clstrs = 0; ++ u64 seq; + + might_sleep(); + sb = ar->inode->i_sb; +@@ -4551,6 +4596,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + } + + ac->ac_op = EXT4_MB_HISTORY_PREALLOC; ++ seq = *this_cpu_ptr(&discard_pa_seq); + if (!ext4_mb_use_preallocated(ac)) { + ac->ac_op = EXT4_MB_HISTORY_ALLOC; + ext4_mb_normalize_request(ac, ar); +@@ -4582,7 +4628,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + ar->len = ac->ac_b_ex.fe_len; + } + } else { +- if (ext4_mb_discard_preallocations_should_retry(sb, ac)) ++ if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) + goto repeat; + *errp = -ENOSPC; + } +-- +2.35.1 + diff --git a/queue-4.9/ext4-mballoc-refactor-ext4_mb_discard_preallocations.patch b/queue-4.9/ext4-mballoc-refactor-ext4_mb_discard_preallocations.patch new file mode 100644 index 00000000000..d392cb90940 --- /dev/null +++ b/queue-4.9/ext4-mballoc-refactor-ext4_mb_discard_preallocations.patch @@ -0,0 +1,68 @@ +From efe8e20f4a0346002421e900257f48bf0577ae20 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 May 2020 12:10:33 +0530 +Subject: ext4: mballoc: refactor ext4_mb_discard_preallocations() + +From: Ritesh Harjani + +[ Upstream commit cf5e2ca6c99077d128e971149f0c262e808ca831 ] + +Implement ext4_mb_discard_preallocations_should_retry() +which we will need in later patches to add more logic +like check for sequence number 
match to see if we should +retry for block allocation or not. + +There should be no functionality change in this patch. + +Signed-off-by: Ritesh Harjani +Link: https://lore.kernel.org/r/1cfae0098d2aa9afbeb59331401258182868c8f2.1589955723.git.riteshh@linux.ibm.com +Signed-off-by: Theodore Ts'o +Stable-dep-of: 80fa46d6b9e7 ("ext4: limit the number of retries after discarding preallocations blocks") +Signed-off-by: Sasha Levin +--- + fs/ext4/mballoc.c | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index f39b65601233..eb3eeae73faa 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -4465,6 +4465,17 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) + return freed; + } + ++static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, ++ struct ext4_allocation_context *ac) ++{ ++ int freed; ++ ++ freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); ++ if (freed) ++ return true; ++ return false; ++} ++ + /* + * Main entry point into mballoc to allocate blocks + * it tries to use preallocation first, then falls back +@@ -4473,7 +4484,6 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) + ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + struct ext4_allocation_request *ar, int *errp) + { +- int freed; + struct ext4_allocation_context *ac = NULL; + struct ext4_sb_info *sbi; + struct super_block *sb; +@@ -4572,8 +4582,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, + ar->len = ac->ac_b_ex.fe_len; + } + } else { +- freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); +- if (freed) ++ if (ext4_mb_discard_preallocations_should_retry(sb, ac)) + goto repeat; + *errp = -ENOSPC; + } +-- +2.35.1 + diff --git a/queue-4.9/series b/queue-4.9/series new file mode 100644 index 00000000000..548fdf144c5 --- /dev/null +++ b/queue-4.9/series @@ -0,0 +1,4 @@ 
+usb-serial-option-improve-quectel-ep06-detection.patch +ext4-mballoc-refactor-ext4_mb_discard_preallocations.patch +ext4-mballoc-introduce-pcpu-seqcnt-for-freeing-pa-to.patch +ext4-limit-the-number-of-retries-after-discarding-pr.patch diff --git a/queue-4.9/usb-serial-option-improve-quectel-ep06-detection.patch b/queue-4.9/usb-serial-option-improve-quectel-ep06-detection.patch new file mode 100644 index 00000000000..b1d21ff4f3a --- /dev/null +++ b/queue-4.9/usb-serial-option-improve-quectel-ep06-detection.patch @@ -0,0 +1,86 @@ +From fc6c931409726629804ee4c8d1388930742bc279 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 13 Sep 2018 11:21:49 +0200 +Subject: USB: serial: option: improve Quectel EP06 detection + +From: Kristian Evensen + +[ Upstream commit 36cae568404a298a19a6e8a3f18641075d4cab04 ] + +The Quectel EP06 (and EM06/EG06) LTE modem supports updating the USB +configuration, without the VID/PID or configuration number changing. +When the configuration is updated and interfaces are added/removed, the +interface numbers are updated. This causes our current code for matching +EP06 not to work as intended, as the assumption about reserved +interfaces no longer holds. If for example the diagnostic (first) +interface is removed, option will (try to) bind to the QMI interface. + +This patch improves EP06 detection by replacing the current match with +two matches, and those matches check class, subclass and protocol as +well as VID and PID. The diag interface exports class, subclass and +protocol as 0xff. For the other serial interfaces, class is 0xff and +subclass and protocol are both 0x0. + +The modem can export the following devices and always in this order: +diag, nmea, at, ppp, qmi and adb. This means that diag can only ever be +interface 0, and interface numbers 1-5 should be marked as reserved. The +three other serial devices can have interface numbers 0-3, but I have +not marked any interfaces as reserved.
The reason is that the serial +devices are the only interfaces exported by the device where subclass +and protocol are 0x0. + +QMI exports the same class, subclass and protocol values as the diag +interface. However, the two interfaces have a different number of +endpoints: QMI has three and diag two. I have added a check for number of +endpoints if VID/PID matches the EP06, and we ignore the interface if +number of endpoints equals three (and subclass is set). + +Signed-off-by: Kristian Evensen +Acked-by: Dan Williams +[ johan: drop unneeded RSVD(5) for ADB ] +Cc: stable +Signed-off-by: Johan Hovold +Stable-dep-of: f8f67eff6847 ("USB: serial: option: add Quectel BG95 0x0203 composition") +Signed-off-by: Sasha Levin +--- + drivers/usb/serial/option.c | 17 +++++++++++++++-- + 1 file changed, 15 insertions(+), 2 deletions(-) + +diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c +index dc39243688e4..7b54a58bb8e7 100644 +--- a/drivers/usb/serial/option.c ++++ b/drivers/usb/serial/option.c +@@ -1126,8 +1126,9 @@ static const struct usb_device_id option_ids[] = { + .driver_info = RSVD(4) }, + { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_BG96), + .driver_info = RSVD(4) }, +- { USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06), +- .driver_info = RSVD(4) | RSVD(5) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0xff, 0xff), ++ .driver_info = RSVD(1) | RSVD(2) | RSVD(3) | RSVD(4) }, ++ { USB_DEVICE_AND_INTERFACE_INFO(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EP06, 0xff, 0, 0) }, + { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6001) }, + { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_CMU_300) }, + { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6003), +@@ -2216,6 +2217,18 @@ static int option_probe(struct usb_serial *serial, + iface_desc->bInterfaceClass != USB_CLASS_CDC_DATA) + return -ENODEV; + ++ /* ++ * Don't bind to the QMI device of the Quectel EP06/EG06/EM06.
Class, ++ * subclass and protocol is 0xff for both the diagnostic port and the ++ * QMI interface, but the diagnostic port only has two endpoints (QMI ++ * has three). ++ */ ++ if (dev_desc->idVendor == cpu_to_le16(QUECTEL_VENDOR_ID) && ++ dev_desc->idProduct == cpu_to_le16(QUECTEL_PRODUCT_EP06) && ++ iface_desc->bInterfaceSubClass && iface_desc->bNumEndpoints == 3) { ++ return -ENODEV; ++ } ++ + /* Store the device flags so we can use them during attach. */ + usb_set_serial_data(serial, (void *)device_flags); + +-- +2.35.1 + -- 2.47.3