From b1dd693e5b9348bd68a80e679e03cf9c0973b01b Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Date: Wed, 24 Nov 2010 12:57:06 -0800
Subject: memcg: avoid deadlock between move charge and try_charge()

From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

commit b1dd693e5b9348bd68a80e679e03cf9c0973b01b upstream.

__mem_cgroup_try_charge() can be called under down_write(&mmap_sem) (e.g.
mlock does it).  This means it can cause a deadlock if it races with move charge:

Ex.1)
                move charge           |        try charge
--------------------------------------+------------------------------
    mem_cgroup_can_attach()           |  down_write(&mmap_sem)
      mc.moving_task = current        |    ..
      mem_cgroup_precharge_mc()       |  __mem_cgroup_try_charge()
        mem_cgroup_count_precharge()  |    prepare_to_wait()
          down_read(&mmap_sem)        |    if (mc.moving_task)
          -> cannot acquire the lock  |    -> true
                                      |      schedule()

Ex.2)
                move charge           |        try charge
--------------------------------------+------------------------------
    mem_cgroup_can_attach()           |
      mc.moving_task = current        |
      mem_cgroup_precharge_mc()       |
        mem_cgroup_count_precharge()  |
          down_read(&mmap_sem)        |
          ..                          |
          up_read(&mmap_sem)          |
                                      |  down_write(&mmap_sem)
    mem_cgroup_move_task()            |    ..
      mem_cgroup_move_charge()        |  __mem_cgroup_try_charge()
        down_read(&mmap_sem)          |    prepare_to_wait()
        -> cannot acquire the lock    |    if (mc.moving_task)
                                      |    -> true
                                      |      schedule()

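Both examples hinge on the same waiting path inside __mem_cgroup_try_charge():
a charger that observes a charge move in flight parks itself on mc.waitq and
sleeps until the mover wakes it via wake_up_all(&mc.waitq).  Roughly (a
simplified sketch only; the exact checks and the surrounding retry loop in the
real function differ):

        if (mc.moving_task && current != mc.moving_task) {
                DEFINE_WAIT(wait);
                prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
                /* the move may have completed while we prepared to wait */
                if (mc.moving_task)
                        schedule();
                finish_wait(&mc.waitq, &wait);
                goto again;     /* illustrative retry of the charge */
        }

The deadlock arises because the sleeping charger may itself hold the very
mmap_sem (for write) that the moving task needs for read, so the wake-up that
would end the sleep never comes.
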
To avoid this deadlock, we do all the move charge work (both can_attach() and
attach()) under a single mmap_sem section.
After this patch, we also set/clear mc.moving_task outside mc.lock, because we
use the lock only to check mc.from/to.
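
Condensed, the mover's side now looks like this (every call below appears in
the hunks that follow; the ordering is compressed and error handling omitted):

        /* mem_cgroup_can_attach(): take mmap_sem once, up front */
        down_read(&mm->mmap_sem);
        spin_lock(&mc.lock);
        mc.from = from;                 /* mc.lock only guards from/to */
        mc.to = mem;
        spin_unlock(&mc.lock);
        mc.moving_task = current;       /* now set outside mc.lock */
        mc.mm = mm;                     /* remembered for later stages */

        /* mem_cgroup_move_task(): reuse the mm pinned in can_attach() */
        mem_cgroup_move_charge(mc.mm);

        /* mem_cgroup_clear_mc(): end of the single mmap_sem section */
        up_read(&mc.mm->mmap_sem);
        mmput(mc.mm);
        wake_up_all(&mc.waitq);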

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 mm/memcontrol.c |   43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -269,13 +269,14 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-        spinlock_t lock; /* for from, to, moving_task */
+        spinlock_t lock; /* for from, to */
         struct mem_cgroup *from;
         struct mem_cgroup *to;
         unsigned long precharge;
         unsigned long moved_charge;
         unsigned long moved_swap;
         struct task_struct *moving_task;        /* a task moving charges */
+        struct mm_struct *mm;
         wait_queue_head_t waitq;                /* a waitq for other context */
 } mc = {
         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4445,7 +4446,7 @@ static unsigned long mem_cgroup_count_pr
         unsigned long precharge;
         struct vm_area_struct *vma;
 
-        down_read(&mm->mmap_sem);
+        /* We've already held the mmap_sem */
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 struct mm_walk mem_cgroup_count_precharge_walk = {
                         .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4457,7 +4458,6 @@ static unsigned long mem_cgroup_count_pr
                 walk_page_range(vma->vm_start, vma->vm_end,
                                 &mem_cgroup_count_precharge_walk);
         }
-        up_read(&mm->mmap_sem);
 
         precharge = mc.precharge;
         mc.precharge = 0;
@@ -4508,11 +4508,16 @@ static void mem_cgroup_clear_mc(void)
 
                 mc.moved_swap = 0;
         }
+        if (mc.mm) {
+                up_read(&mc.mm->mmap_sem);
+                mmput(mc.mm);
+        }
         spin_lock(&mc.lock);
         mc.from = NULL;
         mc.to = NULL;
-        mc.moving_task = NULL;
         spin_unlock(&mc.lock);
+        mc.moving_task = NULL;
+        mc.mm = NULL;
         memcg_oom_recover(from);
         memcg_oom_recover(to);
         wake_up_all(&mc.waitq);
@@ -4537,26 +4542,37 @@ static int mem_cgroup_can_attach(struct
                         return 0;
                 /* We move charges only when we move a owner of the mm */
                 if (mm->owner == p) {
+                        /*
+                         * We do all the move charge works under one mmap_sem to
+                         * avoid deadlock with down_write(&mmap_sem)
+                         * -> try_charge() -> if (mc.moving_task) -> sleep.
+                         */
+                        down_read(&mm->mmap_sem);
+
                         VM_BUG_ON(mc.from);
                         VM_BUG_ON(mc.to);
                         VM_BUG_ON(mc.precharge);
                         VM_BUG_ON(mc.moved_charge);
                         VM_BUG_ON(mc.moved_swap);
                         VM_BUG_ON(mc.moving_task);
+                        VM_BUG_ON(mc.mm);
+
                         spin_lock(&mc.lock);
                         mc.from = from;
                         mc.to = mem;
                         mc.precharge = 0;
                         mc.moved_charge = 0;
                         mc.moved_swap = 0;
-                        mc.moving_task = current;
                         spin_unlock(&mc.lock);
+                        mc.moving_task = current;
+                        mc.mm = mm;
 
                         ret = mem_cgroup_precharge_mc(mm);
                         if (ret)
                                 mem_cgroup_clear_mc();
-                }
-                mmput(mm);
+                        /* We call up_read() and mmput() in clear_mc(). */
+                } else
+                        mmput(mm);
         }
         return ret;
 }
@@ -4644,7 +4660,7 @@ static void mem_cgroup_move_charge(struc
         struct vm_area_struct *vma;
 
         lru_add_drain_all();
-        down_read(&mm->mmap_sem);
+        /* We've already held the mmap_sem */
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 int ret;
                 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4663,7 +4679,6 @@ static void mem_cgroup_move_charge(struc
                          */
                         break;
         }
-        up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4672,17 +4687,11 @@ static void mem_cgroup_move_task(struct
                                 struct task_struct *p,
                                 bool threadgroup)
 {
-        struct mm_struct *mm;
-
-        if (!mc.to)
+        if (!mc.mm)
                 /* no need to move charge */
                 return;
 
-        mm = get_task_mm(p);
-        if (mm) {
-                mem_cgroup_move_charge(mm);
-                mmput(mm);
-        }
+        mem_cgroup_move_charge(mc.mm);
         mem_cgroup_clear_mc();
 }
 #else        /* !CONFIG_MMU */