From b1dd693e5b9348bd68a80e679e03cf9c0973b01b Mon Sep 17 00:00:00 2001
From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Date: Wed, 24 Nov 2010 12:57:06 -0800
Subject: memcg: avoid deadlock between move charge and try_charge()

From: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>

commit b1dd693e5b9348bd68a80e679e03cf9c0973b01b upstream.

__mem_cgroup_try_charge() can be called under down_write(&mmap_sem) (e.g.
mlock does it).  This means it can cause a deadlock if it races with move
charge:

Ex.1)
                move charge             |        try charge
  --------------------------------------+------------------------------
  mem_cgroup_can_attach()               |  down_write(&mmap_sem)
    mc.moving_task = current            |    ..
    mem_cgroup_precharge_mc()           |  __mem_cgroup_try_charge()
      mem_cgroup_count_precharge()      |    prepare_to_wait()
        down_read(&mmap_sem)            |    if (mc.moving_task)
        -> cannot acquire the lock      |    -> true
                                        |      schedule()

Ex.2)
                move charge             |        try charge
  --------------------------------------+------------------------------
  mem_cgroup_can_attach()               |
    mc.moving_task = current            |
    mem_cgroup_precharge_mc()           |
      mem_cgroup_count_precharge()      |
        down_read(&mmap_sem)            |
        ..                              |
        up_read(&mmap_sem)              |
                                        |  down_write(&mmap_sem)
  mem_cgroup_move_task()                |    ..
    mem_cgroup_move_charge()            |  __mem_cgroup_try_charge()
      down_read(&mmap_sem)              |    prepare_to_wait()
      -> cannot acquire the lock        |    if (mc.moving_task)
                                        |    -> true
                                        |      schedule()

To avoid this deadlock, we do all the move charge work (both can_attach()
and attach()) under one mmap_sem section.
After this patch, we set/clear mc.moving_task outside mc.lock, because we
use the lock only to check mc.from/to.
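
The ordering this enforces can be illustrated with a minimal userspace
sketch (an analogue only: pthreads stand in for mmap_sem, mc.lock and
mc.waitq, and every name below is a stand-in, not kernel code):

/*
 * The mover takes the "mmap_sem" for read *before* announcing itself as
 * moving_task and holds it across the whole move, so a charger that owns
 * the lock for write can never observe moving_task set and never sleeps
 * on the waitq while blocking the mover.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t mc_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t mc_waitq = PTHREAD_COND_INITIALIZER;
static bool moving_task;		/* stands in for mc.moving_task */

static void *mover(void *arg)		/* plays move charge */
{
	(void)arg;
	pthread_rwlock_rdlock(&mmap_sem);	/* lock first ... */
	pthread_mutex_lock(&mc_lock);
	moving_task = true;			/* ... announce second */
	pthread_mutex_unlock(&mc_lock);

	/* precharge and move charge would run here, under the read lock */

	pthread_mutex_lock(&mc_lock);
	moving_task = false;
	pthread_cond_broadcast(&mc_waitq);	/* wake_up_all(&mc.waitq) */
	pthread_mutex_unlock(&mc_lock);
	pthread_rwlock_unlock(&mmap_sem);
	return NULL;
}

static void *charger(void *arg)		/* plays try charge */
{
	(void)arg;
	pthread_rwlock_wrlock(&mmap_sem);	/* down_write(), e.g. mlock */
	pthread_mutex_lock(&mc_lock);
	/*
	 * With the ordering above this loop never runs: moving_task is only
	 * true while the mover holds the read lock.  Had the mover set
	 * moving_task before taking the read lock (the old ordering), we
	 * could sleep here holding the write lock while the mover blocks
	 * on rdlock() -- exactly Ex.1 above.
	 */
	while (moving_task)
		pthread_cond_wait(&mc_waitq, &mc_lock);
	pthread_mutex_unlock(&mc_lock);
	pthread_rwlock_unlock(&mmap_sem);
	return NULL;
}

int main(void)
{
	pthread_t a, b;
	pthread_create(&a, NULL, mover, NULL);
	pthread_create(&b, NULL, charger, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	puts("no deadlock");
	return 0;
}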

Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 mm/memcontrol.c |   43 ++++++++++++++++++++++++++-----------------
 1 file changed, 26 insertions(+), 17 deletions(-)

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -269,13 +269,14 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
-	spinlock_t	  lock; /* for from, to, moving_task */
+	spinlock_t	  lock; /* for from, to */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
 	unsigned long moved_charge;
 	unsigned long moved_swap;
 	struct task_struct *moving_task;	/* a task moving charges */
+	struct mm_struct *mm;
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
 	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -4445,7 +4446,7 @@ static unsigned long mem_cgroup_count_pr
 	unsigned long precharge;
 	struct vm_area_struct *vma;
 
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		struct mm_walk mem_cgroup_count_precharge_walk = {
 			.pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4457,7 +4458,6 @@ static unsigned long mem_cgroup_count_pr
 		walk_page_range(vma->vm_start, vma->vm_end,
 					&mem_cgroup_count_precharge_walk);
 	}
-	up_read(&mm->mmap_sem);
 
 	precharge = mc.precharge;
 	mc.precharge = 0;
@@ -4508,11 +4508,16 @@ static void mem_cgroup_clear_mc(void)
 
 		mc.moved_swap = 0;
 	}
+	if (mc.mm) {
+		up_read(&mc.mm->mmap_sem);
+		mmput(mc.mm);
+	}
 	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
-	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mc.moving_task = NULL;
+	mc.mm = NULL;
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -4537,26 +4542,37 @@ static int mem_cgroup_can_attach(struct
 			return 0;
 		/* We move charges only when we move a owner of the mm */
 		if (mm->owner == p) {
+			/*
+			 * We do all the move charge works under one mmap_sem to
+			 * avoid deadlock with down_write(&mmap_sem)
+			 * -> try_charge() -> if (mc.moving_task) -> sleep.
+			 */
+			down_read(&mm->mmap_sem);
+
 			VM_BUG_ON(mc.from);
 			VM_BUG_ON(mc.to);
 			VM_BUG_ON(mc.precharge);
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			VM_BUG_ON(mc.mm);
+
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
 			mc.precharge = 0;
 			mc.moved_charge = 0;
 			mc.moved_swap = 0;
-			mc.moving_task = current;
 			spin_unlock(&mc.lock);
+			mc.moving_task = current;
+			mc.mm = mm;
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)
 				mem_cgroup_clear_mc();
-		}
-		mmput(mm);
+			/* We call up_read() and mmput() in clear_mc(). */
+		} else
+			mmput(mm);
 	}
 	return ret;
 }
@@ -4644,7 +4660,7 @@ static void mem_cgroup_move_charge(struc
 	struct vm_area_struct *vma;
 
 	lru_add_drain_all();
-	down_read(&mm->mmap_sem);
+	/* We've already held the mmap_sem */
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		int ret;
 		struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4663,7 +4679,6 @@ static void mem_cgroup_move_charge(struc
 		 */
 		break;
 	}
-	up_read(&mm->mmap_sem);
 }
 
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4672,17 +4687,11 @@ static void mem_cgroup_move_task(struct
 				struct task_struct *p,
 				bool threadgroup)
 {
-	struct mm_struct *mm;
-
-	if (!mc.to)
+	if (!mc.mm)
 		/* no need to move charge */
 		return;
 
-	mm = get_task_mm(p);
-	if (mm) {
-		mem_cgroup_move_charge(mm);
-		mmput(mm);
-	}
+	mem_cgroup_move_charge(mc.mm);
 	mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
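
Taken together, the hunks leave the move-charge path roughly as follows
(a condensed sketch reconstructed from the diff above, error paths omitted):

/*
 * mem_cgroup_can_attach()
 *     down_read(&mm->mmap_sem);        // taken once, up front
 *     mc.from = from; mc.to = mem;     // under mc.lock
 *     mc.moving_task = current;        // now set outside mc.lock
 *     mc.mm = mm;                      // kept for the later stages
 *     mem_cgroup_precharge_mc(mm);     // VMA walk, mmap_sem already held
 *
 * mem_cgroup_move_task()
 *     mem_cgroup_move_charge(mc.mm);   // still under the same down_read()
 *     mem_cgroup_clear_mc();           // up_read() + mmput(), reset mc.*,
 *                                      // then wake_up_all(&mc.waitq)
 */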