releases/3.6.2/ext4-fix-potential-deadlock-in-ext4_nonda_switch.patch

   1 From 00d4e7362ed01987183e9528295de3213031309c Mon Sep 17 00:00:00 2001
   2 From: Theodore Ts'o <tytso@mit.edu>
   3 Date: Wed, 19 Sep 2012 22:42:36 -0400
   4 Subject: ext4: fix potential deadlock in ext4_nonda_switch()
   5
   6 From: Theodore Ts'o <tytso@mit.edu>
   7
   8 commit 00d4e7362ed01987183e9528295de3213031309c upstream.
   9
  10 In ext4_nonda_switch(), if the file system is getting full we used to
  11 call writeback_inodes_sb_if_idle().  The problem is that we can be
  12 holding i_mutex already, and this causes a potential deadlock when
  13 writeback_inodes_sb_if_idle() tries to take s_umount.  (See
  14 lockdep output below).
  15
  16 As it turns out we don't need to hold s_umount; the fact that we
  17 are in the middle of the write(2) system call will keep the superblock
  18 pinned.  Unfortunately writeback_inodes_sb() checks to make sure
  19 s_umount is taken, and the VFS uses a different mechanism for making
  20 sure the file system doesn't get unmounted out from under us.  The
  21 simplest way of dealing with this is to just simply grab s_umount
  22 using a trylock, and skip kicking the writeback flusher thread in the
  23 very unlikely case that we can't take a read lock on s_umount without
  24 blocking.
  25
  26 Also, we now check the criteria for kicking the writeback thread
  27 before we decide whether to fall back to non-delayed writeback, so
  28 if there are any outstanding delayed allocation writes, we try to get
  29 them resolved as soon as possible.
  30
  31    [ INFO: possible circular locking dependency detected ]
  32    3.6.0-rc1-00042-gce894ca #367 Not tainted
  33    -------------------------------------------------------
  34    dd/8298 is trying to acquire lock:
  35     (&type->s_umount_key#18){++++..}, at: [<c02277d4>] writeback_inodes_sb_if_idle+0x28/0x46
  36
  37    but task is already holding lock:
  38     (&sb->s_type->i_mutex_key#8){+.+...}, at: [<c01ddcce>] generic_file_aio_write+0x5f/0xd3
  39
  40    which lock already depends on the new lock.
  41
  42    2 locks held by dd/8298:
  43     #0:  (sb_writers#2){.+.+.+}, at: [<c01ddcc5>] generic_file_aio_write+0x56/0xd3
  44     #1:  (&sb->s_type->i_mutex_key#8){+.+...}, at: [<c01ddcce>] generic_file_aio_write+0x5f/0xd3
  45
  46    stack backtrace:
  47    Pid: 8298, comm: dd Not tainted 3.6.0-rc1-00042-gce894ca #367
  48    Call Trace:
  49     [<c015b79c>] ? console_unlock+0x345/0x372
  50     [<c06d62a1>] print_circular_bug+0x190/0x19d
  51     [<c019906c>] __lock_acquire+0x86d/0xb6c
  52     [<c01999db>] ? mark_held_locks+0x5c/0x7b
  53     [<c0199724>] lock_acquire+0x66/0xb9
  54     [<c02277d4>] ? writeback_inodes_sb_if_idle+0x28/0x46
  55     [<c06db935>] down_read+0x28/0x58
  56     [<c02277d4>] ? writeback_inodes_sb_if_idle+0x28/0x46
  57     [<c02277d4>] writeback_inodes_sb_if_idle+0x28/0x46
  58     [<c026f3b2>] ext4_nonda_switch+0xe1/0xf4
  59     [<c0271ece>] ext4_da_write_begin+0x27/0x193
  60     [<c01dcdb0>] generic_file_buffered_write+0xc8/0x1bb
  61     [<c01ddc47>] __generic_file_aio_write+0x1dd/0x205
  62     [<c01ddce7>] generic_file_aio_write+0x78/0xd3
  63     [<c026d336>] ext4_file_write+0x480/0x4a6
  64     [<c0198c1d>] ? __lock_acquire+0x41e/0xb6c
  65     [<c0180944>] ? sched_clock_cpu+0x11a/0x13e
  66     [<c01967e9>] ? trace_hardirqs_off+0xb/0xd
  67     [<c018099f>] ? local_clock+0x37/0x4e
  68     [<c0209f2c>] do_sync_write+0x67/0x9d
  69     [<c0209ec5>] ? wait_on_retry_sync_kiocb+0x44/0x44
  70     [<c020a7b9>] vfs_write+0x7b/0xe6
  71     [<c020a9a6>] sys_write+0x3b/0x64
  72     [<c06dd4bd>] syscall_call+0x7/0xb
  73
  74 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
  75 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  76
  77 ---
  78  fs/ext4/inode.c   |   17 ++++++++++-------
  79  fs/fs-writeback.c |    1 +
  80  2 files changed, 11 insertions(+), 7 deletions(-)
  81
  82 --- a/fs/ext4/inode.c
  83 +++ b/fs/ext4/inode.c
  84 @@ -2463,6 +2463,16 @@ static int ext4_nonda_switch(struct supe
  85         free_blocks  = EXT4_C2B(sbi,
  86                 percpu_counter_read_positive(&sbi->s_freeclusters_counter));
  87         dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);
  88 +       /*
  89 +        * Start pushing delalloc when 1/2 of free blocks are dirty.
  90 +        */
  91 +       if (dirty_blocks && (free_blocks < 2 * dirty_blocks) &&
  92 +           !writeback_in_progress(sb->s_bdi) &&
  93 +           down_read_trylock(&sb->s_umount)) {
  94 +               writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
  95 +               up_read(&sb->s_umount);
  96 +       }
  97 +
  98         if (2 * free_blocks < 3 * dirty_blocks ||
  99                 free_blocks < (dirty_blocks + EXT4_FREECLUSTERS_WATERMARK)) {
 100                 /*
 101 @@ -2471,13 +2481,6 @@ static int ext4_nonda_switch(struct supe
 102                  */
 103                 return 1;
 104         }
 105 -       /*
 106 -        * Even if we don't switch but are nearing capacity,
 107 -        * start pushing delalloc when 1/2 of free blocks are dirty.
 108 -        */
 109 -       if (free_blocks < 2 * dirty_blocks)
 110 -               writeback_inodes_sb_if_idle(sb, WB_REASON_FS_FREE_SPACE);
 111 -
 112         return 0;
 113  }
 114
 115 --- a/fs/fs-writeback.c
 116 +++ b/fs/fs-writeback.c
 117 @@ -63,6 +63,7 @@ int writeback_in_progress(struct backing
 118  {
 119         return test_bit(BDI_writeback_running, &bdi->state);
 120  }
 121 +EXPORT_SYMBOL(writeback_in_progress);
 122
 123  static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 124  {