git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob - queue-4.14/writeback-synchronize-sync-2-against-cgroup-writebac.patch
Linux 4.9.162
[thirdparty/kernel/stable-queue.git] / queue-4.14 / writeback-synchronize-sync-2-against-cgroup-writebac.patch
1 From 21442b247263a8ddea8a899eac40eefdf6a6decd Mon Sep 17 00:00:00 2001
2 From: Tejun Heo <tj@kernel.org>
3 Date: Tue, 12 Dec 2017 08:38:30 -0800
4 Subject: writeback: synchronize sync(2) against cgroup writeback membership
5 switches
6
7 [ Upstream commit 7fc5854f8c6efae9e7624970ab49a1eac2faefb1 ]
8
9 sync_inodes_sb() can race against cgwb (cgroup writeback) membership
10 switches and fail to writeback some inodes. For example, if an inode
11 switches to another wb while sync_inodes_sb() is in progress, the new
12 wb might not be visible to bdi_split_work_to_wbs() at all or the inode
13 might jump from a wb which hasn't issued writebacks yet to one which
14 already has.
15
16 This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb
17 switch path against sync_inodes_sb() so that sync_inodes_sb() is
18 guaranteed to see all the target wbs and inodes can't jump wbs to
19 escape syncing.
20
21 v2: Fixed misplaced rwsem init. Spotted by Jiufei.
22
23 Signed-off-by: Tejun Heo <tj@kernel.org>
24 Reported-by: Jiufei Xue <xuejiufei@gmail.com>
25 Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com
26 Acked-by: Jan Kara <jack@suse.cz>
27 Signed-off-by: Jens Axboe <axboe@kernel.dk>
28 Signed-off-by: Sasha Levin <sashal@kernel.org>
29 ---
30 fs/fs-writeback.c | 40 ++++++++++++++++++++++++++++++--
31 include/linux/backing-dev-defs.h | 1 +
32 mm/backing-dev.c | 1 +
33 3 files changed, 40 insertions(+), 2 deletions(-)
34
35 diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
36 index 3244932f4d5cc..6a76616c9401b 100644
37 --- a/fs/fs-writeback.c
38 +++ b/fs/fs-writeback.c
39 @@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
40 struct work_struct work;
41 };
42
43 +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
44 +{
45 + down_write(&bdi->wb_switch_rwsem);
46 +}
47 +
48 +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
49 +{
50 + up_write(&bdi->wb_switch_rwsem);
51 +}
52 +
53 static void inode_switch_wbs_work_fn(struct work_struct *work)
54 {
55 struct inode_switch_wbs_context *isw =
56 container_of(work, struct inode_switch_wbs_context, work);
57 struct inode *inode = isw->inode;
58 + struct backing_dev_info *bdi = inode_to_bdi(inode);
59 struct address_space *mapping = inode->i_mapping;
60 struct bdi_writeback *old_wb = inode->i_wb;
61 struct bdi_writeback *new_wb = isw->new_wb;
62 @@ -343,6 +354,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
63 bool switched = false;
64 void **slot;
65
66 + /*
67 + * If @inode switches cgwb membership while sync_inodes_sb() is
68 + * being issued, sync_inodes_sb() might miss it. Synchronize.
69 + */
70 + down_read(&bdi->wb_switch_rwsem);
71 +
72 /*
73 * By the time control reaches here, RCU grace period has passed
74 * since I_WB_SWITCH assertion and all wb stat update transactions
75 @@ -435,6 +452,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
76 spin_unlock(&new_wb->list_lock);
77 spin_unlock(&old_wb->list_lock);
78
79 + up_read(&bdi->wb_switch_rwsem);
80 +
81 if (switched) {
82 wb_wakeup(new_wb);
83 wb_put(old_wb);
84 @@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
85 if (inode->i_state & I_WB_SWITCH)
86 return;
87
88 + /*
89 + * Avoid starting new switches while sync_inodes_sb() is in
90 + * progress. Otherwise, if the down_write protected issue path
91 + * blocks heavily, we might end up starting a large number of
92 + * switches which will block on the rwsem.
93 + */
94 + if (!down_read_trylock(&bdi->wb_switch_rwsem))
95 + return;
96 +
97 isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
98 if (!isw)
99 - return;
100 + goto out_unlock;
101
102 /* find and pin the new wb */
103 rcu_read_lock();
104 @@ -511,12 +539,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
105 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
106 */
107 call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
108 - return;
109 + goto out_unlock;
110
111 out_free:
112 if (isw->new_wb)
113 wb_put(isw->new_wb);
114 kfree(isw);
115 +out_unlock:
116 + up_read(&bdi->wb_switch_rwsem);
117 }
118
119 /**
120 @@ -894,6 +924,9 @@ fs_initcall(cgroup_writeback_init);
121
122 #else /* CONFIG_CGROUP_WRITEBACK */
123
124 +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
125 +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
126 +
127 static struct bdi_writeback *
128 locked_inode_to_wb_and_lock_list(struct inode *inode)
129 __releases(&inode->i_lock)
130 @@ -2408,8 +2441,11 @@ void sync_inodes_sb(struct super_block *sb)
131 return;
132 WARN_ON(!rwsem_is_locked(&sb->s_umount));
133
134 + /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
135 + bdi_down_write_wb_switch_rwsem(bdi);
136 bdi_split_work_to_wbs(bdi, &work, false);
137 wb_wait_for_completion(bdi, &done);
138 + bdi_up_write_wb_switch_rwsem(bdi);
139
140 wait_sb_inodes(sb);
141 }
142 diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
143 index 19240379637fe..b186c4b464e02 100644
144 --- a/include/linux/backing-dev-defs.h
145 +++ b/include/linux/backing-dev-defs.h
146 @@ -165,6 +165,7 @@ struct backing_dev_info {
147 struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
148 struct rb_root cgwb_congested_tree; /* their congested states */
149 struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */
150 + struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
151 #else
152 struct bdi_writeback_congested *wb_congested;
153 #endif
154 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
155 index 9386c98dac123..6fa31754eadd9 100644
156 --- a/mm/backing-dev.c
157 +++ b/mm/backing-dev.c
158 @@ -684,6 +684,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
159 INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
160 bdi->cgwb_congested_tree = RB_ROOT;
161 mutex_init(&bdi->cgwb_release_mutex);
162 + init_rwsem(&bdi->wb_switch_rwsem);
163
164 ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
165 if (!ret) {
166 --
167 2.19.1
168