git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 31 Aug 2020 10:08:35 +0000 (12:08 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 31 Aug 2020 10:08:35 +0000 (12:08 +0200)
added patches:
writeback-avoid-skipping-inode-writeback.patch
writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch
writeback-protect-inode-i_io_list-with-inode-i_lock.patch

queue-4.4/series
queue-4.4/writeback-avoid-skipping-inode-writeback.patch [new file with mode: 0644]
queue-4.4/writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch [new file with mode: 0644]
queue-4.4/writeback-protect-inode-i_io_list-with-inode-i_lock.patch [new file with mode: 0644]

index 1f9136d8e7f25383bef856564c1714167424ee48..876239d7b7f40716f6bbf5182454c713d8381f78 100644 (file)
@@ -44,3 +44,6 @@ vt_ioctl-change-vt_resizex-ioctl-to-check-for-error-return-from-vc_resize.patch
 serial-samsung-removes-the-irq-not-found-warning.patch
 serial-pl011-don-t-leak-amba_ports-entry-on-driver-register-error.patch
 serial-8250-change-lock-order-in-serial8250_do_startup.patch
+writeback-protect-inode-i_io_list-with-inode-i_lock.patch
+writeback-avoid-skipping-inode-writeback.patch
+writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch
diff --git a/queue-4.4/writeback-avoid-skipping-inode-writeback.patch b/queue-4.4/writeback-avoid-skipping-inode-writeback.patch
new file mode 100644 (file)
index 0000000..995151f
--- /dev/null
@@ -0,0 +1,146 @@
+From 5afced3bf28100d81fb2fe7e98918632a08feaf5 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Fri, 29 May 2020 15:05:22 +0200
+Subject: writeback: Avoid skipping inode writeback
+
+From: Jan Kara <jack@suse.cz>
+
+commit 5afced3bf28100d81fb2fe7e98918632a08feaf5 upstream.
+
+An inode's i_io_list list head is used to attach the inode to several
+different lists - wb->{b_dirty, b_dirty_time, b_io, b_more_io}. When the
+flush worker prepares a list of inodes for writeback, e.g. for sync(2),
+it moves inodes to the b_io list. Thus it is critical for sync(2) data
+integrity guarantees that the inode is not requeued to any other
+writeback list while it is queued for processing by the flush worker.
+That is the reason why writeback_single_inode() does not touch i_io_list
+(unless the inode is completely clean) and why __mark_inode_dirty() does
+not touch i_io_list if the I_SYNC flag is set.
+
+However, there are two flaws in the current logic:
+
+1) When an inode has only I_DIRTY_TIME set but is already queued on the
+b_io list due to sync(2), a concurrent __mark_inode_dirty(inode,
+I_DIRTY_SYNC) can still move the inode back to the b_dirty list, which
+results in skipping writeback of the inode's timestamps during sync(2).
+
+2) When an inode is on the b_dirty_time list and writeback_single_inode()
+races with __mark_inode_dirty() like this:
+
+writeback_single_inode()               __mark_inode_dirty(inode, I_DIRTY_PAGES)
+  inode->i_state |= I_SYNC
+  __writeback_single_inode()
+                                         inode->i_state |= I_DIRTY_PAGES;
+                                         if (inode->i_state & I_SYNC)
+                                           bail
+  if (!(inode->i_state & I_DIRTY_ALL))
+  - not true so nothing done
+
+We end up with an I_DIRTY_PAGES inode on the b_dirty_time list, and thus
+standard background writeback will not write back this inode, leading to
+possible dirty throttling stalls etc. (thanks to Martijn Coenen for this
+analysis).
+
+Fix these problems by tracking whether an inode is queued on the b_io or
+b_more_io lists in a new I_SYNC_QUEUED flag. When this flag is set, we
+know the flush worker has queued the inode and we should not touch
+i_io_list. On the other hand, we also know that once the flush worker is
+done with the inode, it will requeue the inode to the appropriate dirty
+list. When I_SYNC_QUEUED is not set, __mark_inode_dirty() can (and must)
+move the inode to the appropriate dirty list.
+
+Reported-by: Martijn Coenen <maco@android.com>
+Reviewed-by: Martijn Coenen <maco@android.com>
+Tested-by: Martijn Coenen <maco@android.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option")
+CC: stable@vger.kernel.org
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/fs-writeback.c  |   17 ++++++++++++-----
+ include/linux/fs.h |    8 ++++++--
+ 2 files changed, 18 insertions(+), 7 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -162,6 +162,7 @@ static void inode_io_list_del_locked(str
+       assert_spin_locked(&wb->list_lock);
+       assert_spin_locked(&inode->i_lock);
++      inode->i_state &= ~I_SYNC_QUEUED;
+       list_del_init(&inode->i_io_list);
+       wb_io_lists_depopulated(wb);
+ }
+@@ -1062,6 +1063,7 @@ static void redirty_tail_locked(struct i
+                       inode->dirtied_when = jiffies;
+       }
+       inode_io_list_move_locked(inode, wb, &wb->b_dirty);
++      inode->i_state &= ~I_SYNC_QUEUED;
+ }
+ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
+@@ -1137,8 +1139,11 @@ static int move_expired_inodes(struct li
+                       break;
+               list_move(&inode->i_io_list, &tmp);
+               moved++;
++              spin_lock(&inode->i_lock);
+               if (flags & EXPIRE_DIRTY_ATIME)
+-                      set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
++                      inode->i_state |= I_DIRTY_TIME_EXPIRED;
++              inode->i_state |= I_SYNC_QUEUED;
++              spin_unlock(&inode->i_lock);
+               if (sb_is_blkdev_sb(inode->i_sb))
+                       continue;
+               if (sb && sb != inode->i_sb)
+@@ -1313,6 +1318,7 @@ static void requeue_inode(struct inode *
+       } else if (inode->i_state & I_DIRTY_TIME) {
+               inode->dirtied_when = jiffies;
+               inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
++              inode->i_state &= ~I_SYNC_QUEUED;
+       } else {
+               /* The inode is clean. Remove from writeback lists. */
+               inode_io_list_del_locked(inode, wb);
+@@ -2140,11 +2146,12 @@ void __mark_inode_dirty(struct inode *in
+               inode->i_state |= flags;
+               /*
+-               * If the inode is being synced, just update its dirty state.
+-               * The unlocker will place the inode on the appropriate
+-               * superblock list, based upon its state.
++               * If the inode is queued for writeback by flush worker, just
++               * update its dirty state. Once the flush worker is done with
++               * the inode it will place it on the appropriate superblock
++               * list, based upon its state.
+                */
+-              if (inode->i_state & I_SYNC)
++              if (inode->i_state & I_SYNC_QUEUED)
+                       goto out_unlock_inode;
+               /*
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -1882,6 +1882,10 @@ struct super_operations {
+  *                    wb stat updates to grab mapping->tree_lock.  See
+  *                    inode_switch_wb_work_fn() for details.
+  *
++ * I_SYNC_QUEUED      Inode is queued in b_io or b_more_io writeback lists.
++ *                    Used to detect that mark_inode_dirty() should not move
++ *                    inode between dirty lists.
++ *
+  * Q: What is the difference between I_WILL_FREE and I_FREEING?
+  */
+ #define I_DIRTY_SYNC          (1 << 0)
+@@ -1899,9 +1903,9 @@ struct super_operations {
+ #define I_DIO_WAKEUP          (1 << __I_DIO_WAKEUP)
+ #define I_LINKABLE            (1 << 10)
+ #define I_DIRTY_TIME          (1 << 11)
+-#define __I_DIRTY_TIME_EXPIRED        12
+-#define I_DIRTY_TIME_EXPIRED  (1 << __I_DIRTY_TIME_EXPIRED)
++#define I_DIRTY_TIME_EXPIRED  (1 << 12)
+ #define I_WB_SWITCH           (1 << 13)
++#define I_SYNC_QUEUED         (1 << 17)
+ #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+ #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
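
The following is a minimal, hypothetical userspace sketch of the invariant the
patch above introduces: while I_SYNC_QUEUED is set (the inode sits on b_io or
b_more_io), only the flush worker moves the inode between writeback lists, and
a concurrent __mark_inode_dirty() only updates the dirty bits. The struct, the
flag values and the single "list" field are simplified stand-ins, not the
kernel's real data structures.

/* Toy model of the I_SYNC_QUEUED rule; illustrative only, not kernel code. */
#include <stdio.h>

enum which_list { B_DIRTY, B_DIRTY_TIME, B_IO };

#define I_DIRTY_PAGES  (1u << 0)
#define I_DIRTY_TIME   (1u << 1)
#define I_SYNC_QUEUED  (1u << 2)   /* set while queued on b_io / b_more_io */

struct toy_inode {
    unsigned int state;
    enum which_list list;
};

/* Flush worker: expire the inode onto b_io and mark it as queued. */
static void queue_for_writeback(struct toy_inode *inode)
{
    inode->list = B_IO;
    inode->state |= I_SYNC_QUEUED;
}

/* Dirtier: update dirty bits, but never requeue an inode the worker owns. */
static void mark_inode_dirty(struct toy_inode *inode, unsigned int flags)
{
    inode->state |= flags;
    if (inode->state & I_SYNC_QUEUED)
        return;                    /* the flush worker will requeue it */
    inode->list = B_DIRTY;
}

/* Flush worker: after writeback, drop the flag and requeue appropriately. */
static void requeue_after_writeback(struct toy_inode *inode)
{
    inode->state &= ~I_SYNC_QUEUED;
    inode->list = (inode->state & I_DIRTY_PAGES) ? B_DIRTY : B_DIRTY_TIME;
}

int main(void)
{
    struct toy_inode inode = { .state = I_DIRTY_TIME, .list = B_DIRTY_TIME };

    queue_for_writeback(&inode);              /* sync(2) pulls it onto b_io  */
    mark_inode_dirty(&inode, I_DIRTY_PAGES);  /* concurrent dirtier: no move */
    printf("still on b_io: %s\n", inode.list == B_IO ? "yes" : "no");

    requeue_after_writeback(&inode);          /* only the worker requeues it */
    printf("requeued to b_dirty: %s\n", inode.list == B_DIRTY ? "yes" : "no");
    return 0;
}
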
diff --git a/queue-4.4/writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch b/queue-4.4/writeback-fix-sync-livelock-due-to-b_dirty_time-processing.patch
new file mode 100644 (file)
index 0000000..f6c13e8
--- /dev/null
@@ -0,0 +1,193 @@
+From f9cae926f35e8230330f28c7b743ad088611a8de Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Fri, 29 May 2020 16:08:58 +0200
+Subject: writeback: Fix sync livelock due to b_dirty_time processing
+
+From: Jan Kara <jack@suse.cz>
+
+commit f9cae926f35e8230330f28c7b743ad088611a8de upstream.
+
+When we are processing writeback for sync(2), move_expired_inodes()
+does not set any inode expiry value (older_than_this). This can result
+in writeback never completing if there is a steady stream of inodes
+being added to the b_dirty_time list, because writeback rechecks the
+dirty lists after each writeback round to see whether there is more
+work to be done. Fix the problem by using the sync(2) start time as the
+inode expiry value when processing the b_dirty_time list, just as for
+ordinarily dirtied inodes. This requires some refactoring of the
+older_than_this handling, which simplifies the code noticeably as a
+bonus.
+
+Fixes: 0ae45f63d4ef ("vfs: add support for a lazytime mount option")
+CC: stable@vger.kernel.org
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/fs-writeback.c                |   44 +++++++++++++++------------------------
+ include/trace/events/writeback.h |   13 +++++------
+ 2 files changed, 23 insertions(+), 34 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -45,7 +45,6 @@ struct wb_completion {
+ struct wb_writeback_work {
+       long nr_pages;
+       struct super_block *sb;
+-      unsigned long *older_than_this;
+       enum writeback_sync_modes sync_mode;
+       unsigned int tagged_writepages:1;
+       unsigned int for_kupdate:1;
+@@ -1109,16 +1108,13 @@ static bool inode_dirtied_after(struct i
+ #define EXPIRE_DIRTY_ATIME 0x0001
+ /*
+- * Move expired (dirtied before work->older_than_this) dirty inodes from
++ * Move expired (dirtied before dirtied_before) dirty inodes from
+  * @delaying_queue to @dispatch_queue.
+  */
+ static int move_expired_inodes(struct list_head *delaying_queue,
+                              struct list_head *dispatch_queue,
+-                             int flags,
+-                             struct wb_writeback_work *work)
++                             int flags, unsigned long dirtied_before)
+ {
+-      unsigned long *older_than_this = NULL;
+-      unsigned long expire_time;
+       LIST_HEAD(tmp);
+       struct list_head *pos, *node;
+       struct super_block *sb = NULL;
+@@ -1126,16 +1122,9 @@ static int move_expired_inodes(struct li
+       int do_sb_sort = 0;
+       int moved = 0;
+-      if ((flags & EXPIRE_DIRTY_ATIME) == 0)
+-              older_than_this = work->older_than_this;
+-      else if (!work->for_sync) {
+-              expire_time = jiffies - (dirtytime_expire_interval * HZ);
+-              older_than_this = &expire_time;
+-      }
+       while (!list_empty(delaying_queue)) {
+               inode = wb_inode(delaying_queue->prev);
+-              if (older_than_this &&
+-                  inode_dirtied_after(inode, *older_than_this))
++              if (inode_dirtied_after(inode, dirtied_before))
+                       break;
+               list_move(&inode->i_io_list, &tmp);
+               moved++;
+@@ -1181,18 +1170,22 @@ out:
+  *                                           |
+  *                                           +--> dequeue for IO
+  */
+-static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
++static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
++                   unsigned long dirtied_before)
+ {
+       int moved;
++      unsigned long time_expire_jif = dirtied_before;
+       assert_spin_locked(&wb->list_lock);
+       list_splice_init(&wb->b_more_io, &wb->b_io);
+-      moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
++      moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, dirtied_before);
++      if (!work->for_sync)
++              time_expire_jif = jiffies - dirtytime_expire_interval * HZ;
+       moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+-                                   EXPIRE_DIRTY_ATIME, work);
++                                   EXPIRE_DIRTY_ATIME, time_expire_jif);
+       if (moved)
+               wb_io_lists_populated(wb);
+-      trace_writeback_queue_io(wb, work, moved);
++      trace_writeback_queue_io(wb, work, dirtied_before, moved);
+ }
+ static int write_inode(struct inode *inode, struct writeback_control *wbc)
+@@ -1703,7 +1696,7 @@ static long writeback_inodes_wb(struct b
+       blk_start_plug(&plug);
+       spin_lock(&wb->list_lock);
+       if (list_empty(&wb->b_io))
+-              queue_io(wb, &work);
++              queue_io(wb, &work, jiffies);
+       __writeback_inodes_wb(wb, &work);
+       spin_unlock(&wb->list_lock);
+       blk_finish_plug(&plug);
+@@ -1723,7 +1716,7 @@ static long writeback_inodes_wb(struct b
+  * takes longer than a dirty_writeback_interval interval, then leave a
+  * one-second gap.
+  *
+- * older_than_this takes precedence over nr_to_write.  So we'll only write back
++ * dirtied_before takes precedence over nr_to_write.  So we'll only write back
+  * all dirty pages if they are all attached to "old" mappings.
+  */
+ static long wb_writeback(struct bdi_writeback *wb,
+@@ -1731,14 +1724,11 @@ static long wb_writeback(struct bdi_writ
+ {
+       unsigned long wb_start = jiffies;
+       long nr_pages = work->nr_pages;
+-      unsigned long oldest_jif;
++      unsigned long dirtied_before = jiffies;
+       struct inode *inode;
+       long progress;
+       struct blk_plug plug;
+-      oldest_jif = jiffies;
+-      work->older_than_this = &oldest_jif;
+-
+       blk_start_plug(&plug);
+       spin_lock(&wb->list_lock);
+       for (;;) {
+@@ -1772,14 +1762,14 @@ static long wb_writeback(struct bdi_writ
+                * safe.
+                */
+               if (work->for_kupdate) {
+-                      oldest_jif = jiffies -
++                      dirtied_before = jiffies -
+                               msecs_to_jiffies(dirty_expire_interval * 10);
+               } else if (work->for_background)
+-                      oldest_jif = jiffies;
++                      dirtied_before = jiffies;
+               trace_writeback_start(wb, work);
+               if (list_empty(&wb->b_io))
+-                      queue_io(wb, work);
++                      queue_io(wb, work, dirtied_before);
+               if (work->sb)
+                       progress = writeback_sb_inodes(work->sb, wb, work);
+               else
+--- a/include/trace/events/writeback.h
++++ b/include/trace/events/writeback.h
+@@ -390,8 +390,9 @@ DEFINE_WBC_EVENT(wbc_writepage);
+ TRACE_EVENT(writeback_queue_io,
+       TP_PROTO(struct bdi_writeback *wb,
+                struct wb_writeback_work *work,
++               unsigned long dirtied_before,
+                int moved),
+-      TP_ARGS(wb, work, moved),
++      TP_ARGS(wb, work, dirtied_before, moved),
+       TP_STRUCT__entry(
+               __array(char,           name, 32)
+               __field(unsigned long,  older)
+@@ -401,19 +402,17 @@ TRACE_EVENT(writeback_queue_io,
+               __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
+       ),
+       TP_fast_assign(
+-              unsigned long *older_than_this = work->older_than_this;
+               strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+-              __entry->older  = older_than_this ?  *older_than_this : 0;
+-              __entry->age    = older_than_this ?
+-                                (jiffies - *older_than_this) * 1000 / HZ : -1;
++              __entry->older  = dirtied_before;
++              __entry->age    = (jiffies - dirtied_before) * 1000 / HZ;
+               __entry->moved  = moved;
+               __entry->reason = work->reason;
+               __trace_wb_assign_cgroup(__get_str(cgroup), wb);
+       ),
+       TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
+               __entry->name,
+-              __entry->older, /* older_than_this in jiffies */
+-              __entry->age,   /* older_than_this in relative milliseconds */
++              __entry->older, /* dirtied_before in jiffies */
++              __entry->age,   /* dirtied_before in relative milliseconds */
+               __entry->moved,
+               __print_symbolic(__entry->reason, WB_WORK_REASON),
+               __get_str(cgroup)
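
A brief illustration of why the fix above stops the livelock: the expiry
cutoff (dirtied_before) is frozen at the time sync(2) starts, so inodes
dirtied after that point are never pulled into the current pass. The sketch
below is a hypothetical userspace model only; plain integers stand in for
jiffies and fixed arrays for the wb->b_* lists.

/* Toy model of the dirtied_before cutoff; illustrative only, not kernel code. */
#include <stdio.h>
#include <stddef.h>

struct toy_inode {
    unsigned long dirtied_when;    /* stand-in for the jiffies timestamp */
};

/*
 * Move inodes dirtied at or before @dirtied_before from @src to @dst.
 * Because the cutoff is fixed when the sync pass starts, inodes dirtied
 * afterwards are simply skipped instead of extending the pass forever.
 */
static int move_expired(const struct toy_inode *src, size_t n,
                        struct toy_inode *dst, unsigned long dirtied_before)
{
    int moved = 0;

    for (size_t i = 0; i < n; i++) {
        if (src[i].dirtied_when > dirtied_before)
            continue;              /* dirtied after sync(2) started */
        dst[moved++] = src[i];
    }
    return moved;
}

int main(void)
{
    struct toy_inode dirty_time[] = { { 90 }, { 100 }, { 110 }, { 120 } };
    struct toy_inode io[4];
    unsigned long sync_start = 100;    /* cutoff taken when sync(2) begins */

    int moved = move_expired(dirty_time, 4, io, sync_start);

    printf("moved %d of 4 inodes (dirtied at or before %lu)\n",
           moved, sync_start);
    return 0;
}
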
diff --git a/queue-4.4/writeback-protect-inode-i_io_list-with-inode-i_lock.patch b/queue-4.4/writeback-protect-inode-i_io_list-with-inode-i_lock.patch
new file mode 100644 (file)
index 0000000..297a700
--- /dev/null
@@ -0,0 +1,107 @@
+From b35250c0816c7cf7d0a8de92f5fafb6a7508a708 Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Wed, 10 Jun 2020 17:36:03 +0200
+Subject: writeback: Protect inode->i_io_list with inode->i_lock
+
+From: Jan Kara <jack@suse.cz>
+
+commit b35250c0816c7cf7d0a8de92f5fafb6a7508a708 upstream.
+
+Currently, operations on inode->i_io_list are protected by
+wb->list_lock. In the following patches we will need to maintain
+consistency between inode->i_state and inode->i_io_list, so change the
+code so that inode->i_lock also protects all of the inode's i_io_list
+handling.
+
+Reviewed-by: Martijn Coenen <maco@android.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+CC: stable@vger.kernel.org # Prerequisite for "writeback: Avoid skipping inode writeback"
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/fs-writeback.c |   22 +++++++++++++++++-----
+ 1 file changed, 17 insertions(+), 5 deletions(-)
+
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -160,6 +160,7 @@ static void inode_io_list_del_locked(str
+                                    struct bdi_writeback *wb)
+ {
+       assert_spin_locked(&wb->list_lock);
++      assert_spin_locked(&inode->i_lock);
+       list_del_init(&inode->i_io_list);
+       wb_io_lists_depopulated(wb);
+@@ -1034,7 +1035,9 @@ void inode_io_list_del(struct inode *ino
+       struct bdi_writeback *wb;
+       wb = inode_to_wb_and_lock_list(inode);
++      spin_lock(&inode->i_lock);
+       inode_io_list_del_locked(inode, wb);
++      spin_unlock(&inode->i_lock);
+       spin_unlock(&wb->list_lock);
+ }
+@@ -1047,8 +1050,10 @@ void inode_io_list_del(struct inode *ino
+  * the case then the inode must have been redirtied while it was being written
+  * out and we don't reset its dirtied_when.
+  */
+-static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
++static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
+ {
++      assert_spin_locked(&inode->i_lock);
++
+       if (!list_empty(&wb->b_dirty)) {
+               struct inode *tail;
+@@ -1059,6 +1064,13 @@ static void redirty_tail(struct inode *i
+       inode_io_list_move_locked(inode, wb, &wb->b_dirty);
+ }
++static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
++{
++      spin_lock(&inode->i_lock);
++      redirty_tail_locked(inode, wb);
++      spin_unlock(&inode->i_lock);
++}
++
+ /*
+  * requeue inode for re-scanning after bdi->b_io list is exhausted.
+  */
+@@ -1269,7 +1281,7 @@ static void requeue_inode(struct inode *
+                * writeback is not making progress due to locked
+                * buffers. Skip this inode for now.
+                */
+-              redirty_tail(inode, wb);
++              redirty_tail_locked(inode, wb);
+               return;
+       }
+@@ -1289,7 +1301,7 @@ static void requeue_inode(struct inode *
+                        * retrying writeback of the dirty page/inode
+                        * that cannot be performed immediately.
+                        */
+-                      redirty_tail(inode, wb);
++                      redirty_tail_locked(inode, wb);
+               }
+       } else if (inode->i_state & I_DIRTY) {
+               /*
+@@ -1297,7 +1309,7 @@ static void requeue_inode(struct inode *
+                * such as delayed allocation during submission or metadata
+                * updates after data IO completion.
+                */
+-              redirty_tail(inode, wb);
++              redirty_tail_locked(inode, wb);
+       } else if (inode->i_state & I_DIRTY_TIME) {
+               inode->dirtied_when = jiffies;
+               inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
+@@ -1543,8 +1555,8 @@ static long writeback_sb_inodes(struct s
+                */
+               spin_lock(&inode->i_lock);
+               if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
++                      redirty_tail_locked(inode, wb);
+                       spin_unlock(&inode->i_lock);
+-                      redirty_tail(inode, wb);
+                       continue;
+               }
+               if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
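
The locking discipline this last patch establishes can be summarized as: take
wb->list_lock, then inode->i_lock, and only then touch i_io_list, as in the
reworked inode_io_list_del() above. Below is a minimal, hypothetical userspace
sketch of that ordering, with pthread mutexes standing in for the kernel's
spinlocks and simplified stand-in structures.

/* Toy model of the i_io_list locking rule; illustrative only, not kernel code. */
#include <pthread.h>
#include <stdio.h>

struct toy_wb {
    pthread_mutex_t list_lock;
    int b_dirty_len;               /* stands in for the b_dirty list */
};

struct toy_inode {
    pthread_mutex_t i_lock;
    int on_list;
};

/* Lock order mirrors the hunk above: wb->list_lock first, then inode->i_lock. */
static void inode_io_list_del(struct toy_inode *inode, struct toy_wb *wb)
{
    pthread_mutex_lock(&wb->list_lock);
    pthread_mutex_lock(&inode->i_lock);

    if (inode->on_list) {          /* both i_state-like and list state are */
        inode->on_list = 0;        /* updated under the same pair of locks */
        wb->b_dirty_len--;
    }

    pthread_mutex_unlock(&inode->i_lock);
    pthread_mutex_unlock(&wb->list_lock);
}

int main(void)
{
    struct toy_wb wb = { PTHREAD_MUTEX_INITIALIZER, 1 };
    struct toy_inode inode = { PTHREAD_MUTEX_INITIALIZER, 1 };

    inode_io_list_del(&inode, &wb);
    printf("b_dirty length now %d\n", wb.b_dirty_len);
    return 0;
}
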