From: Greg Kroah-Hartman Date: Mon, 31 Aug 2020 10:08:35 +0000 (+0200) Subject: 4.4-stable patches X-Git-Tag: v4.4.235~47 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=bf6afaf320ed19a4f588efb785b2cac26c4e423d;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: writeback-avoid-skipping-inode-writeback.patch writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch writeback-protect-inode-i_io_list-with-inode-i_lock.patch --- diff --git a/queue-4.4/series b/queue-4.4/series index 1f9136d8e7f..876239d7b7f 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -44,3 +44,6 @@ vt_ioctl-change-vt_resizex-ioctl-to-check-for-error-return-from-vc_resize.patch serial-samsung-removes-the-irq-not-found-warning.patch serial-pl011-don-t-leak-amba_ports-entry-on-driver-register-error.patch serial-8250-change-lock-order-in-serial8250_do_startup.patch +writeback-protect-inode-i_io_list-with-inode-i_lock.patch +writeback-avoid-skipping-inode-writeback.patch +writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch diff --git a/queue-4.4/writeback-avoid-skipping-inode-writeback.patch b/queue-4.4/writeback-avoid-skipping-inode-writeback.patch new file mode 100644 index 00000000000..995151fe00b --- /dev/null +++ b/queue-4.4/writeback-avoid-skipping-inode-writeback.patch @@ -0,0 +1,146 @@ +From 5afced3bf28100d81fb2fe7e98918632a08feaf5 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Fri, 29 May 2020 15:05:22 +0200 +Subject: writeback: Avoid skipping inode writeback + +From: Jan Kara + +commit 5afced3bf28100d81fb2fe7e98918632a08feaf5 upstream. + +Inode's i_io_list list head is used to attach inode to several different +lists - wb->{b_dirty, b_dirty_time, b_io, b_more_io}. When flush worker +prepares a list of inodes to writeback e.g. for sync(2), it moves inodes +to b_io list. Thus it is critical for sync(2) data integrity guarantees +that inode is not requeued to any other writeback list when inode is +queued for processing by flush worker. That's the reason why +writeback_single_inode() does not touch i_io_list (unless the inode is +completely clean) and why __mark_inode_dirty() does not touch i_io_list +if I_SYNC flag is set. + +However there are two flaws in the current logic: + +1) When inode has only I_DIRTY_TIME set but it is already queued in b_io +list due to sync(2), concurrent __mark_inode_dirty(inode, I_DIRTY_SYNC) +can still move inode back to b_dirty list resulting in skipping +writeback of inode time stamps during sync(2). + +2) When inode is on b_dirty_time list and writeback_single_inode() races +with __mark_inode_dirty() like: + +writeback_single_inode() __mark_inode_dirty(inode, I_DIRTY_PAGES) + inode->i_state |= I_SYNC + __writeback_single_inode() + inode->i_state |= I_DIRTY_PAGES; + if (inode->i_state & I_SYNC) + bail + if (!(inode->i_state & I_DIRTY_ALL)) + - not true so nothing done + +We end up with I_DIRTY_PAGES inode on b_dirty_time list and thus +standard background writeback will not writeback this inode leading to +possible dirty throttling stalls etc. (thanks to Martijn Coenen for this +analysis). + +Fix these problems by tracking whether inode is queued in b_io or +b_more_io lists in a new I_SYNC_QUEUED flag. When this flag is set, we +know flush worker has queued inode and we should not touch i_io_list. +On the other hand we also know that once flush worker is done with the +inode it will requeue the inode to appropriate dirty list. When +I_SYNC_QUEUED is not set, __mark_inode_dirty() can (and must) move inode +to appropriate dirty list. + +Reported-by: Martijn Coenen +Reviewed-by: Martijn Coenen +Tested-by: Martijn Coenen +Reviewed-by: Christoph Hellwig +Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option") +CC: stable@vger.kernel.org +Signed-off-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 17 ++++++++++++----- + include/linux/fs.h | 8 ++++++-- + 2 files changed, 18 insertions(+), 7 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -162,6 +162,7 @@ static void inode_io_list_del_locked(str + assert_spin_locked(&wb->list_lock); + assert_spin_locked(&inode->i_lock); + ++ inode->i_state &= ~I_SYNC_QUEUED; + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); + } +@@ -1062,6 +1063,7 @@ static void redirty_tail_locked(struct i + inode->dirtied_when = jiffies; + } + inode_io_list_move_locked(inode, wb, &wb->b_dirty); ++ inode->i_state &= ~I_SYNC_QUEUED; + } + + static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) +@@ -1137,8 +1139,11 @@ static int move_expired_inodes(struct li + break; + list_move(&inode->i_io_list, &tmp); + moved++; ++ spin_lock(&inode->i_lock); + if (flags & EXPIRE_DIRTY_ATIME) +- set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); ++ inode->i_state |= I_DIRTY_TIME_EXPIRED; ++ inode->i_state |= I_SYNC_QUEUED; ++ spin_unlock(&inode->i_lock); + if (sb_is_blkdev_sb(inode->i_sb)) + continue; + if (sb && sb != inode->i_sb) +@@ -1313,6 +1318,7 @@ static void requeue_inode(struct inode * + } else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); ++ inode->i_state &= ~I_SYNC_QUEUED; + } else { + /* The inode is clean. Remove from writeback lists. */ + inode_io_list_del_locked(inode, wb); +@@ -2140,11 +2146,12 @@ void __mark_inode_dirty(struct inode *in + inode->i_state |= flags; + + /* +- * If the inode is being synced, just update its dirty state. +- * The unlocker will place the inode on the appropriate +- * superblock list, based upon its state. ++ * If the inode is queued for writeback by flush worker, just ++ * update its dirty state. Once the flush worker is done with ++ * the inode it will place it on the appropriate superblock ++ * list, based upon its state. + */ +- if (inode->i_state & I_SYNC) ++ if (inode->i_state & I_SYNC_QUEUED) + goto out_unlock_inode; + + /* +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1882,6 +1882,10 @@ struct super_operations { + * wb stat updates to grab mapping->tree_lock. See + * inode_switch_wb_work_fn() for details. + * ++ * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. ++ * Used to detect that mark_inode_dirty() should not move ++ * inode between dirty lists. ++ * + * Q: What is the difference between I_WILL_FREE and I_FREEING? + */ + #define I_DIRTY_SYNC (1 << 0) +@@ -1899,9 +1903,9 @@ struct super_operations { + #define I_DIO_WAKEUP (1 << __I_DIO_WAKEUP) + #define I_LINKABLE (1 << 10) + #define I_DIRTY_TIME (1 << 11) +-#define __I_DIRTY_TIME_EXPIRED 12 +-#define I_DIRTY_TIME_EXPIRED (1 << __I_DIRTY_TIME_EXPIRED) ++#define I_DIRTY_TIME_EXPIRED (1 << 12) + #define I_WB_SWITCH (1 << 13) ++#define I_SYNC_QUEUED (1 << 17) + + #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) + #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) diff --git a/queue-4.4/writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch b/queue-4.4/writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch new file mode 100644 index 00000000000..f6c13e89148 --- /dev/null +++ b/queue-4.4/writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch @@ -0,0 +1,193 @@ +From f9cae926f35e8230330f28c7b743ad088611a8de Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Fri, 29 May 2020 16:08:58 +0200 +Subject: writeback: Fix sync livelock due to b_dirty_time processing + +From: Jan Kara + +commit f9cae926f35e8230330f28c7b743ad088611a8de upstream. + +When we are processing writeback for sync(2), move_expired_inodes() +didn't set any inode expiry value (older_than_this). This can result in +writeback never completing if there's steady stream of inodes added to +b_dirty_time list as writeback rechecks dirty lists after each writeback +round whether there's more work to be done. Fix the problem by using +sync(2) start time is inode expiry value when processing b_dirty_time +list similarly as for ordinarily dirtied inodes. This requires some +refactoring of older_than_this handling which simplifies the code +noticeably as a bonus. + +Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option") +CC: stable@vger.kernel.org +Reviewed-by: Christoph Hellwig +Signed-off-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 44 +++++++++++++++------------------------ + include/trace/events/writeback.h | 13 +++++------ + 2 files changed, 23 insertions(+), 34 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -45,7 +45,6 @@ struct wb_completion { + struct wb_writeback_work { + long nr_pages; + struct super_block *sb; +- unsigned long *older_than_this; + enum writeback_sync_modes sync_mode; + unsigned int tagged_writepages:1; + unsigned int for_kupdate:1; +@@ -1109,16 +1108,13 @@ static bool inode_dirtied_after(struct i + #define EXPIRE_DIRTY_ATIME 0x0001 + + /* +- * Move expired (dirtied before work->older_than_this) dirty inodes from ++ * Move expired (dirtied before dirtied_before) dirty inodes from + * @delaying_queue to @dispatch_queue. + */ + static int move_expired_inodes(struct list_head *delaying_queue, + struct list_head *dispatch_queue, +- int flags, +- struct wb_writeback_work *work) ++ int flags, unsigned long dirtied_before) + { +- unsigned long *older_than_this = NULL; +- unsigned long expire_time; + LIST_HEAD(tmp); + struct list_head *pos, *node; + struct super_block *sb = NULL; +@@ -1126,16 +1122,9 @@ static int move_expired_inodes(struct li + int do_sb_sort = 0; + int moved = 0; + +- if ((flags & EXPIRE_DIRTY_ATIME) == 0) +- older_than_this = work->older_than_this; +- else if (!work->for_sync) { +- expire_time = jiffies - (dirtytime_expire_interval * HZ); +- older_than_this = &expire_time; +- } + while (!list_empty(delaying_queue)) { + inode = wb_inode(delaying_queue->prev); +- if (older_than_this && +- inode_dirtied_after(inode, *older_than_this)) ++ if (inode_dirtied_after(inode, dirtied_before)) + break; + list_move(&inode->i_io_list, &tmp); + moved++; +@@ -1181,18 +1170,22 @@ out: + * | + * +--> dequeue for IO + */ +-static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) ++static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, ++ unsigned long dirtied_before) + { + int moved; ++ unsigned long time_expire_jif = dirtied_before; + + assert_spin_locked(&wb->list_lock); + list_splice_init(&wb->b_more_io, &wb->b_io); +- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); ++ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, dirtied_before); ++ if (!work->for_sync) ++ time_expire_jif = jiffies - dirtytime_expire_interval * HZ; + moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, +- EXPIRE_DIRTY_ATIME, work); ++ EXPIRE_DIRTY_ATIME, time_expire_jif); + if (moved) + wb_io_lists_populated(wb); +- trace_writeback_queue_io(wb, work, moved); ++ trace_writeback_queue_io(wb, work, dirtied_before, moved); + } + + static int write_inode(struct inode *inode, struct writeback_control *wbc) +@@ -1703,7 +1696,7 @@ static long writeback_inodes_wb(struct b + blk_start_plug(&plug); + spin_lock(&wb->list_lock); + if (list_empty(&wb->b_io)) +- queue_io(wb, &work); ++ queue_io(wb, &work, jiffies); + __writeback_inodes_wb(wb, &work); + spin_unlock(&wb->list_lock); + blk_finish_plug(&plug); +@@ -1723,7 +1716,7 @@ static long writeback_inodes_wb(struct b + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * +- * older_than_this takes precedence over nr_to_write. So we'll only write back ++ * dirtied_before takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ + static long wb_writeback(struct bdi_writeback *wb, +@@ -1731,14 +1724,11 @@ static long wb_writeback(struct bdi_writ + { + unsigned long wb_start = jiffies; + long nr_pages = work->nr_pages; +- unsigned long oldest_jif; ++ unsigned long dirtied_before = jiffies; + struct inode *inode; + long progress; + struct blk_plug plug; + +- oldest_jif = jiffies; +- work->older_than_this = &oldest_jif; +- + blk_start_plug(&plug); + spin_lock(&wb->list_lock); + for (;;) { +@@ -1772,14 +1762,14 @@ static long wb_writeback(struct bdi_writ + * safe. + */ + if (work->for_kupdate) { +- oldest_jif = jiffies - ++ dirtied_before = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } else if (work->for_background) +- oldest_jif = jiffies; ++ dirtied_before = jiffies; + + trace_writeback_start(wb, work); + if (list_empty(&wb->b_io)) +- queue_io(wb, work); ++ queue_io(wb, work, dirtied_before); + if (work->sb) + progress = writeback_sb_inodes(work->sb, wb, work); + else +--- a/include/trace/events/writeback.h ++++ b/include/trace/events/writeback.h +@@ -390,8 +390,9 @@ DEFINE_WBC_EVENT(wbc_writepage); + TRACE_EVENT(writeback_queue_io, + TP_PROTO(struct bdi_writeback *wb, + struct wb_writeback_work *work, ++ unsigned long dirtied_before, + int moved), +- TP_ARGS(wb, work, moved), ++ TP_ARGS(wb, work, dirtied_before, moved), + TP_STRUCT__entry( + __array(char, name, 32) + __field(unsigned long, older) +@@ -401,19 +402,17 @@ TRACE_EVENT(writeback_queue_io, + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb)) + ), + TP_fast_assign( +- unsigned long *older_than_this = work->older_than_this; + strncpy(__entry->name, dev_name(wb->bdi->dev), 32); +- __entry->older = older_than_this ? *older_than_this : 0; +- __entry->age = older_than_this ? +- (jiffies - *older_than_this) * 1000 / HZ : -1; ++ __entry->older = dirtied_before; ++ __entry->age = (jiffies - dirtied_before) * 1000 / HZ; + __entry->moved = moved; + __entry->reason = work->reason; + __trace_wb_assign_cgroup(__get_str(cgroup), wb); + ), + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s", + __entry->name, +- __entry->older, /* older_than_this in jiffies */ +- __entry->age, /* older_than_this in relative milliseconds */ ++ __entry->older, /* dirtied_before in jiffies */ ++ __entry->age, /* dirtied_before in relative milliseconds */ + __entry->moved, + __print_symbolic(__entry->reason, WB_WORK_REASON), + __get_str(cgroup) diff --git a/queue-4.4/writeback-protect-inode-i_io_list-with-inode-i_lock.patch b/queue-4.4/writeback-protect-inode-i_io_list-with-inode-i_lock.patch new file mode 100644 index 00000000000..297a7001fe3 --- /dev/null +++ b/queue-4.4/writeback-protect-inode-i_io_list-with-inode-i_lock.patch @@ -0,0 +1,107 @@ +From b35250c0816c7cf7d0a8de92f5fafb6a7508a708 Mon Sep 17 00:00:00 2001 +From: Jan Kara +Date: Wed, 10 Jun 2020 17:36:03 +0200 +Subject: writeback: Protect inode->i_io_list with inode->i_lock + +From: Jan Kara + +commit b35250c0816c7cf7d0a8de92f5fafb6a7508a708 upstream. + +Currently, operations on inode->i_io_list are protected by +wb->list_lock. In the following patches we'll need to maintain +consistency between inode->i_state and inode->i_io_list so change the +code so that inode->i_lock protects also all inode's i_io_list handling. + +Reviewed-by: Martijn Coenen +Reviewed-by: Christoph Hellwig +CC: stable@vger.kernel.org # Prerequisite for "writeback: Avoid skipping inode writeback" +Signed-off-by: Jan Kara +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fs-writeback.c | 22 +++++++++++++++++----- + 1 file changed, 17 insertions(+), 5 deletions(-) + +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -160,6 +160,7 @@ static void inode_io_list_del_locked(str + struct bdi_writeback *wb) + { + assert_spin_locked(&wb->list_lock); ++ assert_spin_locked(&inode->i_lock); + + list_del_init(&inode->i_io_list); + wb_io_lists_depopulated(wb); +@@ -1034,7 +1035,9 @@ void inode_io_list_del(struct inode *ino + struct bdi_writeback *wb; + + wb = inode_to_wb_and_lock_list(inode); ++ spin_lock(&inode->i_lock); + inode_io_list_del_locked(inode, wb); ++ spin_unlock(&inode->i_lock); + spin_unlock(&wb->list_lock); + } + +@@ -1047,8 +1050,10 @@ void inode_io_list_del(struct inode *ino + * the case then the inode must have been redirtied while it was being written + * out and we don't reset its dirtied_when. + */ +-static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) ++static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) + { ++ assert_spin_locked(&inode->i_lock); ++ + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; + +@@ -1059,6 +1064,13 @@ static void redirty_tail(struct inode *i + inode_io_list_move_locked(inode, wb, &wb->b_dirty); + } + ++static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) ++{ ++ spin_lock(&inode->i_lock); ++ redirty_tail_locked(inode, wb); ++ spin_unlock(&inode->i_lock); ++} ++ + /* + * requeue inode for re-scanning after bdi->b_io list is exhausted. + */ +@@ -1269,7 +1281,7 @@ static void requeue_inode(struct inode * + * writeback is not making progress due to locked + * buffers. Skip this inode for now. + */ +- redirty_tail(inode, wb); ++ redirty_tail_locked(inode, wb); + return; + } + +@@ -1289,7 +1301,7 @@ static void requeue_inode(struct inode * + * retrying writeback of the dirty page/inode + * that cannot be performed immediately. + */ +- redirty_tail(inode, wb); ++ redirty_tail_locked(inode, wb); + } + } else if (inode->i_state & I_DIRTY) { + /* +@@ -1297,7 +1309,7 @@ static void requeue_inode(struct inode * + * such as delayed allocation during submission or metadata + * updates after data IO completion. + */ +- redirty_tail(inode, wb); ++ redirty_tail_locked(inode, wb); + } else if (inode->i_state & I_DIRTY_TIME) { + inode->dirtied_when = jiffies; + inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); +@@ -1543,8 +1555,8 @@ static long writeback_sb_inodes(struct s + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { ++ redirty_tail_locked(inode, wb); + spin_unlock(&inode->i_lock); +- redirty_tail(inode, wb); + continue; + } + if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {