]>
Commit | Line | Data |
---|---|---|
00e5a55c BS |
1 | Subject: fix booting with memoryless nodes |
2 | From: haveblue@us.ibm.com | |
3 | References: 443280 - LTC49675 | |
4 | ||
5 | I've reproduced this on 2.6.27.7. I'm pretty sure it is caused by this | |
6 | patch: | |
7 | ||
8 | http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=8f64e1f2d1e09267ac926e15090fd505c1c0cbcb | |
9 | ||
10 | The problem is that Jon took a loop which was (in pseudocode): | |
11 | ||
12 | for_each_node(nid) | |
13 | NODE_DATA(nid) = careful_alloc(nid); | |
14 | setup_bootmem(nid); | |
15 | reserve_node_bootmem(nid); | |
16 | ||
17 | and broke it up into: | |
18 | ||
19 | for_each_node(nid) | |
20 | NODE_DATA(nid) = careful_alloc(nid); | |
21 | setup_bootmem(nid); | |
22 | for_each_node(nid) | |
23 | reserve_node_bootmem(nid); | |
24 | ||
25 | The issue comes in when the 'careful_alloc()' is called on a node with | |
26 | no memory. It falls back to using bootmem from a previously-initialized | |
27 | node. But, bootmem has not yet been reserved when Jon's patch is | |
28 | applied. It gives back bogus memory (0xc000000000000000) and pukes | |
29 | later in boot. | |
30 | ||
31 | The following patch collapses the loop back together. It also breaks | |
32 | the mark_reserved_regions_for_nid() code out into a function and adds | |
33 | some comments. I think a huge part of introducing this bug is because | |
34 | the for loop was too long and hard to read. | |
35 | ||
36 | The actual bug fix here is the: | |
37 | ||
38 | + if (end_pfn <= node->node_start_pfn || | |
39 | + start_pfn >= node_end_pfn) | |
40 | + continue; | |
41 | ||
42 | Signed-off-by: Olaf Hering <olh@suse.de> | |
43 | ||
44 | --- | |
45 | arch/powerpc/mm/numa.c | 130 ++++++++++++++++++++++++++++++------------------- | |
46 | 1 file changed, 82 insertions(+), 48 deletions(-) | |
47 | ||
48 | --- a/arch/powerpc/mm/numa.c | |
49 | +++ b/arch/powerpc/mm/numa.c | |
50 | @@ -19,6 +19,7 @@ | |
51 | #include <linux/notifier.h> | |
52 | #include <linux/lmb.h> | |
53 | #include <linux/of.h> | |
54 | +#include <linux/pfn.h> | |
55 | #include <asm/sparsemem.h> | |
56 | #include <asm/prom.h> | |
57 | #include <asm/system.h> | |
58 | @@ -867,10 +868,75 @@ static struct notifier_block __cpuinitda | |
59 | .priority = 1 /* Must run before sched domains notifier. */ | |
60 | }; | |
61 | ||
62 | +static void mark_reserved_regions_for_nid(int nid) | |
63 | +{ | |
64 | + struct pglist_data *node = NODE_DATA(nid); | |
65 | + int i; | |
66 | + | |
67 | + dbg("mark_reserved_regions_for_nid(%d) NODE_DATA: %p\n", nid, node); | |
68 | + for (i = 0; i < lmb.reserved.cnt; i++) { | |
69 | + unsigned long physbase = lmb.reserved.region[i].base; | |
70 | + unsigned long size = lmb.reserved.region[i].size; | |
71 | + unsigned long start_pfn = physbase >> PAGE_SHIFT; | |
72 | + unsigned long end_pfn = PFN_UP(physbase + size); | |
73 | + struct node_active_region node_ar; | |
74 | + unsigned long node_end_pfn = node->node_start_pfn + | |
75 | + node->node_spanned_pages; | |
76 | + | |
77 | + /* | |
78 | + * Check to make sure that this lmb.reserved area is | |
79 | + * within the bounds of the node that we care about. | |
80 | + * Checking the nid of the start and end points is not | |
81 | + * sufficient because the reserved area could span the | |
82 | + * entire node. | |
83 | + */ | |
84 | + if (end_pfn <= node->node_start_pfn || | |
85 | + start_pfn >= node_end_pfn) | |
86 | + continue; | |
87 | + | |
88 | + get_node_active_region(start_pfn, &node_ar); | |
89 | + while (start_pfn < end_pfn && | |
90 | + node_ar.start_pfn < node_ar.end_pfn) { | |
91 | + unsigned long reserve_size = size; | |
92 | + /* | |
93 | + * if reserved region extends past active region | |
94 | + * then trim size to active region | |
95 | + */ | |
96 | + if (end_pfn > node_ar.end_pfn) | |
97 | + reserve_size = (node_ar.end_pfn << PAGE_SHIFT) | |
98 | + - physbase; | |
99 | + /* | |
100 | + * Only worry about *this* node, others may not | |
101 | + * yet have valid NODE_DATA(). | |
102 | + */ | |
103 | + if (node_ar.nid == nid) | |
104 | + reserve_bootmem_node(NODE_DATA(node_ar.nid), | |
105 | + physbase, reserve_size, | |
106 | + BOOTMEM_DEFAULT); | |
107 | + /* | |
108 | + * if reserved region is contained in the active region | |
109 | + * then done. | |
110 | + */ | |
111 | + if (end_pfn <= node_ar.end_pfn) | |
112 | + break; | |
113 | + | |
114 | + /* | |
115 | + * reserved region extends past the active region | |
116 | + * get next active region that contains this | |
117 | + * reserved region | |
118 | + */ | |
119 | + start_pfn = node_ar.end_pfn; | |
120 | + physbase = start_pfn << PAGE_SHIFT; | |
121 | + size = size - reserve_size; | |
122 | + get_node_active_region(start_pfn, &node_ar); | |
123 | + } | |
124 | + } | |
125 | +} | |
126 | + | |
127 | + | |
128 | void __init do_init_bootmem(void) | |
129 | { | |
130 | int nid; | |
131 | - unsigned int i; | |
132 | ||
133 | min_low_pfn = 0; | |
134 | max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT; | |
135 | @@ -890,9 +956,16 @@ void __init do_init_bootmem(void) | |
136 | unsigned long bootmem_paddr; | |
137 | unsigned long bootmap_pages; | |
138 | ||
139 | + dbg("node %d is online\n", nid); | |
140 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | |
141 | ||
142 | - /* Allocate the node structure node local if possible */ | |
143 | + /* | |
144 | + * Allocate the node structure node local if possible | |
145 | + * | |
146 | + * Be careful moving this around, as it relies on all | |
147 | + * previous nodes' bootmem to be initialized and have | |
148 | + * all reserved areas marked. | |
149 | + */ | |
150 | NODE_DATA(nid) = careful_allocation(nid, | |
151 | sizeof(struct pglist_data), | |
152 | SMP_CACHE_BYTES, end_pfn); | |
153 | @@ -924,53 +997,14 @@ void __init do_init_bootmem(void) | |
154 | start_pfn, end_pfn); | |
155 | ||
156 | free_bootmem_with_active_regions(nid, end_pfn); | |
157 | - } | |
158 | - | |
159 | - /* Mark reserved regions */ | |
160 | - for (i = 0; i < lmb.reserved.cnt; i++) { | |
161 | - unsigned long physbase = lmb.reserved.region[i].base; | |
162 | - unsigned long size = lmb.reserved.region[i].size; | |
163 | - unsigned long start_pfn = physbase >> PAGE_SHIFT; | |
164 | - unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT); | |
165 | - struct node_active_region node_ar; | |
166 | - | |
167 | - get_node_active_region(start_pfn, &node_ar); | |
168 | - while (start_pfn < end_pfn && | |
169 | - node_ar.start_pfn < node_ar.end_pfn) { | |
170 | - unsigned long reserve_size = size; | |
171 | - /* | |
172 | - * if reserved region extends past active region | |
173 | - * then trim size to active region | |
174 | - */ | |
175 | - if (end_pfn > node_ar.end_pfn) | |
176 | - reserve_size = (node_ar.end_pfn << PAGE_SHIFT) | |
177 | - - (start_pfn << PAGE_SHIFT); | |
178 | - dbg("reserve_bootmem %lx %lx nid=%d\n", physbase, | |
179 | - reserve_size, node_ar.nid); | |
180 | - reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase, | |
181 | - reserve_size, BOOTMEM_DEFAULT); | |
182 | - /* | |
183 | - * if reserved region is contained in the active region | |
184 | - * then done. | |
185 | - */ | |
186 | - if (end_pfn <= node_ar.end_pfn) | |
187 | - break; | |
188 | - | |
189 | - /* | |
190 | - * reserved region extends past the active region | |
191 | - * get next active region that contains this | |
192 | - * reserved region | |
193 | - */ | |
194 | - start_pfn = node_ar.end_pfn; | |
195 | - physbase = start_pfn << PAGE_SHIFT; | |
196 | - size = size - reserve_size; | |
197 | - get_node_active_region(start_pfn, &node_ar); | |
198 | - } | |
199 | - | |
200 | - } | |
201 | - | |
202 | - for_each_online_node(nid) | |
203 | + /* | |
204 | + * Be very careful about moving this around. Future | |
205 | + * calls to careful_allocation() depend on this getting | |
206 | + * done correctly. | |
207 | + */ | |
208 | + mark_reserved_regions_for_nid(nid); | |
209 | sparse_memory_present_with_active_regions(nid); | |
210 | + } | |
211 | } | |
212 | ||
213 | void __init paging_init(void) |