src/patches/suse-2.6.27.31/patches.fixes/ext3_false_EIO_fix.diff

   1 From: Jan Kara <jack@suse.cz>
   2 Subject: [PATCH] ext3: Avoid false EIO errors
   3 References: bnc#479730
   4
   5 Sometimes block_write_begin() can map buffers in a page but later we fail to
   6 copy data into those buffers (because the source page has been paged out in the
   7 meantime). We then end up with !uptodate mapped buffers. To add a bit more to
   8 the confusion, block_write_end() does not commit any data (and thus does not
   9 mark any buffers as uptodate) if we didn't succeed with copying all the data.
  10
  11 Commit f4fc66a894546bdc88a775d0e83ad20a65210bcb (ext3: convert to new aops)
  12 missed these cases and thus we were inserting non-uptodate buffers into the
  13 transaction's list, which confuses the JBD code: it reports IO errors, aborts
  14 the transaction and generally makes users afraid about their data ;-P.
  15
  16 This patch fixes the problem by reorganizing ext3_..._write_end() code to
  17 first call block_write_end() to mark buffers with valid data uptodate and
  18 after that we file only uptodate buffers to transaction's lists. Also
  19 fix a problem where we could leave blocks allocated beyond i_size (i_disksize
  20 in fact).
  21
  22 Signed-off-by: Jan Kara <jack@suse.cz>
  23
  24 ---
  25  fs/ext3/inode.c |   99 +++++++++++++++++++++++---------------------------------
  26  1 file changed, 42 insertions(+), 57 deletions(-)
  27
  28 --- a/fs/ext3/inode.c
  29 +++ b/fs/ext3/inode.c
  30 @@ -1195,6 +1195,18 @@ int ext3_journal_dirty_data(handle_t *ha
  31         return err;
  32  }
  33
  34 +/* For ordered writepage and write_end functions */
  35 +static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
  36 +{
  37 +       /*
  38 +        * Write could have mapped the buffer but it didn't copy the data in
  39 +        * yet. So avoid filing such buffer into a transaction.
  40 +        */
  41 +       if (buffer_mapped(bh) && buffer_uptodate(bh))
  42 +               return ext3_journal_dirty_data(handle, bh);
  43 +       return 0;
  44 +}
  45 +
  46  /* For write_end() in data=journal mode */
  47  static int write_end_fn(handle_t *handle, struct buffer_head *bh)
  48  {
  49 @@ -1205,26 +1217,29 @@ static int write_end_fn(handle_t *handle
  50  }
  51
  52  /*
  53 - * Generic write_end handler for ordered and writeback ext3 journal modes.
  54 - * We can't use generic_write_end, because that unlocks the page and we need to
  55 - * unlock the page after ext3_journal_stop, but ext3_journal_stop must run
  56 - * after block_write_end.
  57 + * This is nasty and subtle: ext3_write_begin() could have allocated blocks
  58 + * for the whole page but later we failed to copy the data in. So the disk
  59 + * size we really have allocated is pos + len (block_write_end() has zeroed
  60 + * the freshly allocated buffers so we aren't going to write garbage). But we
  61 + * want to keep i_size at the place where data copying finished so that we
  62 + * don't confuse readers. The worst that can happen is that we expose a page
  63 + * of zeros at the end of file after a crash...
  64   */
  65 -static int ext3_generic_write_end(struct file *file,
  66 -                               struct address_space *mapping,
  67 -                               loff_t pos, unsigned len, unsigned copied,
  68 -                               struct page *page, void *fsdata)
  69 +static void update_file_sizes(struct inode *inode, loff_t pos, unsigned len,
  70 +                             unsigned copied)
  71  {
  72 -       struct inode *inode = file->f_mapping->host;
  73 +       int mark_dirty = 0;
  74
  75 -       copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
  76 -
  77 -       if (pos+copied > inode->i_size) {
  78 -               i_size_write(inode, pos+copied);
  79 -               mark_inode_dirty(inode);
  80 +       if (pos + len > EXT3_I(inode)->i_disksize) {
  81 +               mark_dirty = 1;
  82 +               EXT3_I(inode)->i_disksize = pos + len;
  83         }
  84 -
  85 -       return copied;
  86 +       if (pos + copied > inode->i_size) {
  87 +               i_size_write(inode, pos + copied);
  88 +               mark_dirty = 1;
  89 +       }
  90 +       if (mark_dirty)
  91 +               mark_inode_dirty(inode);
  92  }
  93
  94  /*
  95 @@ -1244,29 +1259,17 @@ static int ext3_ordered_write_end(struct
  96         unsigned from, to;
  97         int ret = 0, ret2;
  98
  99 +       copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 100 +
 101 +       /* See comment at update_file_sizes() for why we check buffers up to
 102 +        * from + len */
 103         from = pos & (PAGE_CACHE_SIZE - 1);
 104         to = from + len;
 105 -
 106         ret = walk_page_buffers(handle, page_buffers(page),
 107 -               from, to, NULL, ext3_journal_dirty_data);
 108 +               from, to, NULL, journal_dirty_data_fn);
 109
 110 -       if (ret == 0) {
 111 -               /*
 112 -                * generic_write_end() will run mark_inode_dirty() if i_size
 113 -                * changes.  So let's piggyback the i_disksize mark_inode_dirty
 114 -                * into that.
 115 -                */
 116 -               loff_t new_i_size;
 117 -
 118 -               new_i_size = pos + copied;
 119 -               if (new_i_size > EXT3_I(inode)->i_disksize)
 120 -                       EXT3_I(inode)->i_disksize = new_i_size;
 121 -               ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
 122 -                                                       page, fsdata);
 123 -               copied = ret2;
 124 -               if (ret2 < 0)
 125 -                       ret = ret2;
 126 -       }
 127 +       if (ret == 0)
 128 +               update_file_sizes(inode, pos, len, copied);
 129         ret2 = ext3_journal_stop(handle);
 130         if (!ret)
 131                 ret = ret2;
 132 @@ -1283,22 +1286,11 @@ static int ext3_writeback_write_end(stru
 133  {
 134         handle_t *handle = ext3_journal_current_handle();
 135         struct inode *inode = file->f_mapping->host;
 136 -       int ret = 0, ret2;
 137 -       loff_t new_i_size;
 138 +       int ret;
 139
 140 -       new_i_size = pos + copied;
 141 -       if (new_i_size > EXT3_I(inode)->i_disksize)
 142 -               EXT3_I(inode)->i_disksize = new_i_size;
 143 -
 144 -       ret2 = ext3_generic_write_end(file, mapping, pos, len, copied,
 145 -                                                       page, fsdata);
 146 -       copied = ret2;
 147 -       if (ret2 < 0)
 148 -               ret = ret2;
 149 -
 150 -       ret2 = ext3_journal_stop(handle);
 151 -       if (!ret)
 152 -               ret = ret2;
 153 +       copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 154 +       update_file_sizes(inode, pos, len, copied);
 155 +       ret = ext3_journal_stop(handle);
 156         unlock_page(page);
 157         page_cache_release(page);
 158
 159 @@ -1412,13 +1404,6 @@ static int bput_one(handle_t *handle, st
 160         return 0;
 161  }
 162
 163 -static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
 164 -{
 165 -       if (buffer_mapped(bh))
 166 -               return ext3_journal_dirty_data(handle, bh);
 167 -       return 0;
 168 -}
 169 -
 170  /*
 171   * Note that we always start a transaction even if we're not journalling
 172   * data.  This is to preserve ordering: any hole instantiation within