From bea8c150a7efbc0f204e709b7274fe273f55e0d3 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Fri, 16 Nov 2012 14:14:54 -0800
Subject: memcg: fix hotplugged memory zone oops

From: Hugh Dickins <hughd@google.com>

commit bea8c150a7efbc0f204e709b7274fe273f55e0d3 upstream.

When MEMCG is configured on (even when it's disabled by boot option),
when adding or removing a page to/from its lru list, the zone pointer
used for stats updates is nowadays taken from the struct lruvec. (On
many configurations, calculating zone from page is slower.)

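As an illustrative sketch (paraphrased from 3.6's include/linux/mm_inline.h;
it is not part of this patch), the add-to-lru fast path trusts lruvec->zone
rather than recomputing the zone from the page:

	/* sketch of the 3.6 fast path, not from this patch */
	static __always_inline void add_page_to_lru_list(struct page *page,
					struct lruvec *lruvec, enum lru_list lru)
	{
		int nr_pages = hpage_nr_pages(page);

		mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
		list_add(&page->lru, &lruvec->lists[lru]);
		/* oopses here if lruvec->zone was never set for this node */
		__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
	}
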
But we have no code to update all the lruvecs (per zone, per memcg) when
a memory node is hotadded. Here's an extract from the oops which
results when running numactl to bind a program to a newly onlined node:

 BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60
 IP: __mod_zone_page_state+0x9/0x60
 Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs
 Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0)
 Call Trace:
   __pagevec_lru_add_fn+0xdf/0x140
   pagevec_lru_move_fn+0xb1/0x100
   __pagevec_lru_add+0x1c/0x30
   lru_add_drain_cpu+0xa3/0x130
   lru_add_drain+0x2f/0x40
 ...

The natural solution might be to use a memcg callback whenever memory is
hotadded; but that solution has not been scoped out, and it happens that
we do have an easy location at which to update lruvec->zone. The lruvec
pointer is discovered either by mem_cgroup_zone_lruvec() or by
mem_cgroup_page_lruvec(), and both of those do know the right zone.

So check and set lruvec->zone in those; and remove the inadequate
attempt to set lruvec->zone from lruvec_init(), which is called before
NODE_DATA(node) has been allocated in such cases.

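This lazy repair works because consumers reach the zone through the
lruvec_zone() accessor, which with CONFIG_MEMCG simply dereferences the
cached pointer; roughly (a sketch of 3.6's include/linux/mmzone.h, shown
for context only):

	/* sketch of the 3.6 accessor, not from this patch */
	static inline struct zone *lruvec_zone(struct lruvec *lruvec)
	{
	#ifdef CONFIG_MEMCG
		/* may be unset until a lookup below repairs it */
		return lruvec->zone;
	#else
		return container_of(lruvec, struct zone, lruvec);
	#endif
	}
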
Ah, there was one exception. For no particularly good reason,
mem_cgroup_force_empty_list() has its own code for deciding lruvec.
Change it to use the standard mem_cgroup_zone_lruvec() and
mem_cgroup_get_lru_size() too. In fact it was already safe against such
an oops (the lru lists in danger could only be empty), but we're better
proofed against future changes this way.

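For reference, mem_cgroup_get_lru_size() resolves to the same
mz->lru_size[lru] value that the open-coded version read; roughly
(paraphrased from 3.6's mm/memcontrol.c, not part of this patch):

	/* sketch of the 3.6 helper, not from this patch */
	unsigned long
	mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
	{
		struct mem_cgroup_per_zone *mz;

		mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
		return mz->lru_size[lru];
	}
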
I've marked this for stable (3.6) since we introduced the problem in 3.5
(now closed to stable); but I have no idea if this is the only fix
needed to get memory hotadd working with memcg in 3.6, and received no
answer when I enquired twice before.

Reported-by: Tang Chen <tangchen@cn.fujitsu.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 include/linux/mmzone.h |  2 +-
 mm/memcontrol.c        | 46 +++++++++++++++++++++++++++++++++++-----------
 mm/mmzone.c            |  6 +-----
 mm/page_alloc.c        |  2 +-
 4 files changed, 38 insertions(+), 18 deletions(-)

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str
 				     unsigned long size,
 				     enum memmap_context context);
 
-extern void lruvec_init(struct lruvec *lruvec, struct zone *zone);
+extern void lruvec_init(struct lruvec *lruvec);
 
 static inline struct zone *lruvec_zone(struct lruvec *lruvec)
 {
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1061,12 +1061,24 @@ struct lruvec *mem_cgroup_zone_lruvec(st
 				      struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 
 /*
@@ -1093,9 +1105,12 @@ struct lruvec *mem_cgroup_page_lruvec(st
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
+	struct lruvec *lruvec;
 
-	if (mem_cgroup_disabled())
-		return &zone->lruvec;
+	if (mem_cgroup_disabled()) {
+		lruvec = &zone->lruvec;
+		goto out;
+	}
 
 	pc = lookup_page_cgroup(page);
 	memcg = pc->mem_cgroup;
@@ -1113,7 +1128,16 @@ struct lruvec *mem_cgroup_page_lruvec(st
 		pc->mem_cgroup = memcg = root_mem_cgroup;
 
 	mz = page_cgroup_zoneinfo(memcg, page);
-	return &mz->lruvec;
+	lruvec = &mz->lruvec;
+out:
+	/*
+	 * Since a node can be onlined after the mem_cgroup was created,
+	 * we have to be prepared to initialize lruvec->zone here;
+	 * and if offlined then reonlined, we need to reinitialize it.
+	 */
+	if (unlikely(lruvec->zone != zone))
+		lruvec->zone = zone;
+	return lruvec;
 }
 
 /**
@@ -3703,17 +3727,17 @@ unsigned long mem_cgroup_soft_limit_recl
 static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
 				int node, int zid, enum lru_list lru)
 {
-	struct mem_cgroup_per_zone *mz;
+	struct lruvec *lruvec;
 	unsigned long flags, loop;
 	struct list_head *list;
 	struct page *busy;
 	struct zone *zone;
 
 	zone = &NODE_DATA(node)->node_zones[zid];
-	mz = mem_cgroup_zoneinfo(memcg, node, zid);
-	list = &mz->lruvec.lists[lru];
+	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	list = &lruvec->lists[lru];
 
-	loop = mz->lru_size[lru];
+	loop = mem_cgroup_get_lru_size(lruvec, lru);
 	/* give some margin against EBUSY etc...*/
 	loop += 256;
 	busy = NULL;
@@ -4751,7 +4775,7 @@ static int alloc_mem_cgroup_per_zone_inf
 
 	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
 		mz = &pn->zoneinfo[zone];
-		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
+		lruvec_init(&mz->lruvec);
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->memcg = memcg;
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
 
-void lruvec_init(struct lruvec *lruvec, struct zone *zone)
+void lruvec_init(struct lruvec *lruvec)
 {
 	enum lru_list lru;
 
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec,
 
 	for_each_lru(lru)
 		INIT_LIST_HEAD(&lruvec->lists[lru]);
-
-#ifdef CONFIG_MEMCG
-	lruvec->zone = zone;
-#endif
 }
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		lruvec_init(&zone->lruvec, zone);
+		lruvec_init(&zone->lruvec);
 		if (!size)
 			continue;
 