1 From ebe71d4fcb5cad29134efb77a36b11a546616104 Mon Sep 17 00:00:00 2001
2 From: Jan Kara <jack@suse.cz>
3 Date: Tue, 8 Dec 2009 23:51:10 -0500
4 Subject: [PATCH 28/30] ext4: Wait for proper transaction commit on fsync
6 (cherry picked from commit b436b9bef84de6893e86346d8fbf7104bc520645)
8 We cannot rely on buffer dirty bits during fsync because pdflush can come
9 before fsync is called and clear dirty bits without forcing a transaction
10 commit. What we do is that we track which transaction has last changed
11 the inode and which transaction last changed allocation and force it to disk on fsync.
14 Signed-off-by: Jan Kara <jack@suse.cz>
15 Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
16 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
18 fs/ext4/ext4.h | 7 +++++++
19 fs/ext4/ext4_jbd2.h | 13 +++++++++++++
20 fs/ext4/extents.c | 14 ++++++++++++--
21 fs/ext4/fsync.c | 46 +++++++++++++++++-----------------------------
22 fs/ext4/inode.c | 29 +++++++++++++++++++++++++++++
23 fs/ext4/super.c | 2 ++
24 fs/jbd2/journal.c | 1 +
25 7 files changed, 81 insertions(+), 31 deletions(-)
29 @@ -703,6 +703,13 @@ struct ext4_inode_info {
30 struct list_head i_aio_dio_complete_list;
31 /* current io_end structure for async DIO write*/
32 ext4_io_end_t *cur_aio_dio;
35 + * Transactions that contain inode's metadata needed to complete
36 + * fsync and fdatasync, respectively.
39 + tid_t i_datasync_tid;
43 --- a/fs/ext4/ext4_jbd2.h
44 +++ b/fs/ext4/ext4_jbd2.h
45 @@ -258,6 +258,19 @@ static inline int ext4_jbd2_file_inode(h
49 +static inline void ext4_update_inode_fsync_trans(handle_t *handle,
50 + struct inode *inode,
53 + struct ext4_inode_info *ei = EXT4_I(inode);
55 + if (ext4_handle_valid(handle)) {
56 + ei->i_sync_tid = handle->h_transaction->t_tid;
58 + ei->i_datasync_tid = handle->h_transaction->t_tid;
63 int ext4_force_commit(struct super_block *sb);
65 --- a/fs/ext4/extents.c
66 +++ b/fs/ext4/extents.c
67 @@ -3064,6 +3064,8 @@ ext4_ext_handle_uninitialized_extents(ha
68 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
69 ret = ext4_convert_unwritten_extents_dio(handle, inode,
72 + ext4_update_inode_fsync_trans(handle, inode, 1);
75 /* buffered IO case */
76 @@ -3091,6 +3093,8 @@ ext4_ext_handle_uninitialized_extents(ha
77 ret = ext4_ext_convert_to_initialized(handle, inode,
81 + ext4_update_inode_fsync_trans(handle, inode, 1);
85 @@ -3329,10 +3333,16 @@ int ext4_ext_get_blocks(handle_t *handle
86 allocated = ext4_ext_get_actual_len(&newex);
87 set_buffer_new(bh_result);
89 - /* Cache only when it is _not_ an uninitialized extent */
90 - if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
92 + * Cache the extent and update transaction to commit on fdatasync only
93 + * when it is _not_ an uninitialized extent.
95 + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
96 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
97 EXT4_EXT_CACHE_EXTENT);
98 + ext4_update_inode_fsync_trans(handle, inode, 1);
100 + ext4_update_inode_fsync_trans(handle, inode, 0);
102 if (allocated > max_blocks)
103 allocated = max_blocks;
104 --- a/fs/ext4/fsync.c
105 +++ b/fs/ext4/fsync.c
107 int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
109 struct inode *inode = dentry->d_inode;
110 + struct ext4_inode_info *ei = EXT4_I(inode);
111 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
116 J_ASSERT(ext4_journal_current_handle() == NULL);
118 trace_ext4_sync_file(file, dentry, datasync);
120 + if (inode->i_sb->s_flags & MS_RDONLY)
123 ret = flush_aio_dio_completed_IO(inode);
128 + return simple_fsync(file, dentry, datasync);
132 + * data=writeback,ordered:
133 * The caller's filemap_fdatawrite()/wait will sync the data.
134 - * sync_inode() will sync the metadata
137 - * The caller's filemap_fdatawrite() will write the data and
138 - * sync_inode() will write the inode if it is dirty. Then the caller's
139 - * filemap_fdatawait() will wait on the pages.
140 + * Metadata is in the journal, we wait for proper transaction to
144 * filemap_fdatawrite won't do anything (the buffers are clean).
145 @@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, st
146 if (ext4_should_journal_data(inode))
147 return ext4_force_commit(inode->i_sb);
150 - ret = sync_mapping_buffers(inode->i_mapping);
152 - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
156 - * The VFS has written the file data. If the inode is unaltered
157 - * then we need not start a commit.
159 - if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
160 - struct writeback_control wbc = {
161 - .sync_mode = WB_SYNC_ALL,
162 - .nr_to_write = 0, /* sys_fsync did this */
164 - err = sync_inode(inode, &wbc);
169 - if (journal && (journal->j_flags & JBD2_BARRIER))
170 + commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
171 + if (jbd2_log_start_commit(journal, commit_tid))
172 + jbd2_log_wait_commit(journal, commit_tid);
173 + else if (journal->j_flags & JBD2_BARRIER)
174 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
177 --- a/fs/ext4/inode.c
178 +++ b/fs/ext4/inode.c
179 @@ -1025,6 +1025,8 @@ static int ext4_ind_get_blocks(handle_t
182 set_buffer_new(bh_result);
184 + ext4_update_inode_fsync_trans(handle, inode, 1);
186 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
187 if (count > blocks_to_boundary)
188 @@ -4794,6 +4796,7 @@ struct inode *ext4_iget(struct super_blo
189 struct ext4_inode *raw_inode;
190 struct ext4_inode_info *ei;
192 + journal_t *journal = EXT4_SB(sb)->s_journal;
196 @@ -4858,6 +4861,31 @@ struct inode *ext4_iget(struct super_blo
197 ei->i_data[block] = raw_inode->i_block[block];
198 INIT_LIST_HEAD(&ei->i_orphan);
201 + * Set transaction id's of transactions that have to be committed
202 + * to finish f[data]sync. We set them to currently running transaction
203 + * as we cannot be sure that the inode or some of its metadata isn't
204 + * part of the transaction - the inode could have been reclaimed and
205 + * now it is reread from disk.
208 + transaction_t *transaction;
211 + spin_lock(&journal->j_state_lock);
212 + if (journal->j_running_transaction)
213 + transaction = journal->j_running_transaction;
215 + transaction = journal->j_committing_transaction;
217 + tid = transaction->t_tid;
219 + tid = journal->j_commit_sequence;
220 + spin_unlock(&journal->j_state_lock);
221 + ei->i_sync_tid = tid;
222 + ei->i_datasync_tid = tid;
225 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
226 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
227 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
228 @@ -5112,6 +5140,7 @@ static int ext4_do_update_inode(handle_t
230 ei->i_state &= ~EXT4_STATE_NEW;
232 + ext4_update_inode_fsync_trans(handle, inode, 0);
235 ext4_std_error(inode->i_sb, err);
236 --- a/fs/ext4/super.c
237 +++ b/fs/ext4/super.c
238 @@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(st
239 spin_lock_init(&(ei->i_block_reservation_lock));
240 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
241 ei->cur_aio_dio = NULL;
242 + ei->i_sync_tid = 0;
243 + ei->i_datasync_tid = 0;
245 return &ei->vfs_inode;
247 --- a/fs/jbd2/journal.c
248 +++ b/fs/jbd2/journal.c
249 @@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
250 EXPORT_SYMBOL(jbd2_journal_ack_err);
251 EXPORT_SYMBOL(jbd2_journal_clear_err);
252 EXPORT_SYMBOL(jbd2_log_wait_commit);
253 +EXPORT_SYMBOL(jbd2_log_start_commit);
254 EXPORT_SYMBOL(jbd2_journal_start_commit);
255 EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
256 EXPORT_SYMBOL(jbd2_journal_wipe);