]>
Commit | Line | Data |
---|---|---|
1df8150e GKH |
1 | From bea8c150a7efbc0f204e709b7274fe273f55e0d3 Mon Sep 17 00:00:00 2001 |
2 | From: Hugh Dickins <hughd@google.com> | |
3 | Date: Fri, 16 Nov 2012 14:14:54 -0800 | |
4 | Subject: memcg: fix hotplugged memory zone oops | |
5 | ||
6 | From: Hugh Dickins <hughd@google.com> | |
7 | ||
8 | commit bea8c150a7efbc0f204e709b7274fe273f55e0d3 upstream. | |
9 | ||
10 | When MEMCG is configured on (even when it's disabled by boot option), | |
11 | when adding or removing a page to/from its lru list, the zone pointer | |
12 | used for stats updates is nowadays taken from the struct lruvec. (On | |
13 | many configurations, calculating zone from page is slower.) | |
14 | ||
15 | But we have no code to update all the lruvecs (per zone, per memcg) when | |
16 | a memory node is hotadded. Here's an extract from the oops which | |
17 | results when running numactl to bind a program to a newly onlined node: | |
18 | ||
19 | BUG: unable to handle kernel NULL pointer dereference at 0000000000000f60 | |
20 | IP: __mod_zone_page_state+0x9/0x60 | |
21 | Pid: 1219, comm: numactl Not tainted 3.6.0-rc5+ #180 Bochs Bochs | |
22 | Process numactl (pid: 1219, threadinfo ffff880039abc000, task ffff8800383c4ce0) | |
23 | Call Trace: | |
24 | __pagevec_lru_add_fn+0xdf/0x140 | |
25 | pagevec_lru_move_fn+0xb1/0x100 | |
26 | __pagevec_lru_add+0x1c/0x30 | |
27 | lru_add_drain_cpu+0xa3/0x130 | |
28 | lru_add_drain+0x2f/0x40 | |
29 | ... | |
30 | ||
31 | The natural solution might be to use a memcg callback whenever memory is | |
32 | hotadded; but that solution has not been scoped out, and it happens that | |
33 | we do have an easy location at which to update lruvec->zone. The lruvec | |
34 | pointer is discovered either by mem_cgroup_zone_lruvec() or by | |
35 | mem_cgroup_page_lruvec(), and both of those do know the right zone. | |
36 | ||
37 | So check and set lruvec->zone in those; and remove the inadequate | |
38 | attempt to set lruvec->zone from lruvec_init(), which is called before | |
39 | NODE_DATA(node) has been allocated in such cases. | |
40 | ||
41 | Ah, there was one exception. For no particularly good reason, | |
42 | mem_cgroup_force_empty_list() has its own code for deciding lruvec. | |
43 | Change it to use the standard mem_cgroup_zone_lruvec() and | |
44 | mem_cgroup_get_lru_size() too. In fact it was already safe against such | |
45 | an oops (the lru lists in danger could only be empty), but we're better | |
46 | proofed against future changes this way. | |
47 | ||
48 | I've marked this for stable (3.6) since we introduced the problem in 3.5 | |
49 | (now closed to stable); but I have no idea if this is the only fix | |
50 | needed to get memory hotadd working with memcg in 3.6, and received no | |
51 | answer when I enquired twice before. | |
52 | ||
53 | Reported-by: Tang Chen <tangchen@cn.fujitsu.com> | |
54 | Signed-off-by: Hugh Dickins <hughd@google.com> | |
55 | Acked-by: Johannes Weiner <hannes@cmpxchg.org> | |
56 | Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | |
57 | Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> | |
58 | Cc: Wen Congyang <wency@cn.fujitsu.com> | |
59 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
60 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
61 | Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> | |
62 | ||
63 | --- | |
64 | include/linux/mmzone.h | 2 +- | |
65 | mm/memcontrol.c | 46 +++++++++++++++++++++++++++++++++++----------- | |
66 | mm/mmzone.c | 6 +----- | |
67 | mm/page_alloc.c | 2 +- | |
68 | 4 files changed, 38 insertions(+), 18 deletions(-) | |
69 | ||
70 | --- a/include/linux/mmzone.h | |
71 | +++ b/include/linux/mmzone.h | |
72 | @@ -744,7 +744,7 @@ extern int init_currently_empty_zone(str | |
73 | unsigned long size, | |
74 | enum memmap_context context); | |
75 | ||
76 | -extern void lruvec_init(struct lruvec *lruvec, struct zone *zone); | |
77 | +extern void lruvec_init(struct lruvec *lruvec); | |
78 | ||
79 | static inline struct zone *lruvec_zone(struct lruvec *lruvec) | |
80 | { | |
81 | --- a/mm/memcontrol.c | |
82 | +++ b/mm/memcontrol.c | |
83 | @@ -1061,12 +1061,24 @@ struct lruvec *mem_cgroup_zone_lruvec(st | |
84 | struct mem_cgroup *memcg) | |
85 | { | |
86 | struct mem_cgroup_per_zone *mz; | |
87 | + struct lruvec *lruvec; | |
88 | ||
89 | - if (mem_cgroup_disabled()) | |
90 | - return &zone->lruvec; | |
91 | + if (mem_cgroup_disabled()) { | |
92 | + lruvec = &zone->lruvec; | |
93 | + goto out; | |
94 | + } | |
95 | ||
96 | mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); | |
97 | - return &mz->lruvec; | |
98 | + lruvec = &mz->lruvec; | |
99 | +out: | |
100 | + /* | |
101 | + * Since a node can be onlined after the mem_cgroup was created, | |
102 | + * we have to be prepared to initialize lruvec->zone here; | |
103 | + * and if offlined then reonlined, we need to reinitialize it. | |
104 | + */ | |
105 | + if (unlikely(lruvec->zone != zone)) | |
106 | + lruvec->zone = zone; | |
107 | + return lruvec; | |
108 | } | |
109 | ||
110 | /* | |
111 | @@ -1093,9 +1105,12 @@ struct lruvec *mem_cgroup_page_lruvec(st | |
112 | struct mem_cgroup_per_zone *mz; | |
113 | struct mem_cgroup *memcg; | |
114 | struct page_cgroup *pc; | |
115 | + struct lruvec *lruvec; | |
116 | ||
117 | - if (mem_cgroup_disabled()) | |
118 | - return &zone->lruvec; | |
119 | + if (mem_cgroup_disabled()) { | |
120 | + lruvec = &zone->lruvec; | |
121 | + goto out; | |
122 | + } | |
123 | ||
124 | pc = lookup_page_cgroup(page); | |
125 | memcg = pc->mem_cgroup; | |
126 | @@ -1113,7 +1128,16 @@ struct lruvec *mem_cgroup_page_lruvec(st | |
127 | pc->mem_cgroup = memcg = root_mem_cgroup; | |
128 | ||
129 | mz = page_cgroup_zoneinfo(memcg, page); | |
130 | - return &mz->lruvec; | |
131 | + lruvec = &mz->lruvec; | |
132 | +out: | |
133 | + /* | |
134 | + * Since a node can be onlined after the mem_cgroup was created, | |
135 | + * we have to be prepared to initialize lruvec->zone here; | |
136 | + * and if offlined then reonlined, we need to reinitialize it. | |
137 | + */ | |
138 | + if (unlikely(lruvec->zone != zone)) | |
139 | + lruvec->zone = zone; | |
140 | + return lruvec; | |
141 | } | |
142 | ||
143 | /** | |
144 | @@ -3703,17 +3727,17 @@ unsigned long mem_cgroup_soft_limit_recl | |
145 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |
146 | int node, int zid, enum lru_list lru) | |
147 | { | |
148 | - struct mem_cgroup_per_zone *mz; | |
149 | + struct lruvec *lruvec; | |
150 | unsigned long flags, loop; | |
151 | struct list_head *list; | |
152 | struct page *busy; | |
153 | struct zone *zone; | |
154 | ||
155 | zone = &NODE_DATA(node)->node_zones[zid]; | |
156 | - mz = mem_cgroup_zoneinfo(memcg, node, zid); | |
157 | - list = &mz->lruvec.lists[lru]; | |
158 | + lruvec = mem_cgroup_zone_lruvec(zone, memcg); | |
159 | + list = &lruvec->lists[lru]; | |
160 | ||
161 | - loop = mz->lru_size[lru]; | |
162 | + loop = mem_cgroup_get_lru_size(lruvec, lru); | |
163 | /* give some margin against EBUSY etc...*/ | |
164 | loop += 256; | |
165 | busy = NULL; | |
166 | @@ -4751,7 +4775,7 @@ static int alloc_mem_cgroup_per_zone_inf | |
167 | ||
168 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | |
169 | mz = &pn->zoneinfo[zone]; | |
170 | - lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]); | |
171 | + lruvec_init(&mz->lruvec); | |
172 | mz->usage_in_excess = 0; | |
173 | mz->on_tree = false; | |
174 | mz->memcg = memcg; | |
175 | --- a/mm/mmzone.c | |
176 | +++ b/mm/mmzone.c | |
177 | @@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pf | |
178 | } | |
179 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | |
180 | ||
181 | -void lruvec_init(struct lruvec *lruvec, struct zone *zone) | |
182 | +void lruvec_init(struct lruvec *lruvec) | |
183 | { | |
184 | enum lru_list lru; | |
185 | ||
186 | @@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec, | |
187 | ||
188 | for_each_lru(lru) | |
189 | INIT_LIST_HEAD(&lruvec->lists[lru]); | |
190 | - | |
191 | -#ifdef CONFIG_MEMCG | |
192 | - lruvec->zone = zone; | |
193 | -#endif | |
194 | } | |
195 | --- a/mm/page_alloc.c | |
196 | +++ b/mm/page_alloc.c | |
197 | @@ -4456,7 +4456,7 @@ static void __paginginit free_area_init_ | |
198 | zone->zone_pgdat = pgdat; | |
199 | ||
200 | zone_pcp_init(zone); | |
201 | - lruvec_init(&zone->lruvec, zone); | |
202 | + lruvec_init(&zone->lruvec); | |
203 | if (!size) | |
204 | continue; | |
205 |