+++ /dev/null
-Subject: fix booting with memoryless nodes
-From: haveblue@us.ibm.com
-References: 443280 - LTC49675
-
-I've reproduced this on 2.6.27.7. I'm pretty sure it is caused by this
-patch:
-
-http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=8f64e1f2d1e09267ac926e15090fd505c1c0cbcb
-
-The problem is that Jon took a loop which was (in pseudocode):
-
- for_each_node(nid)
- NODE_DATA(nid) = careful_alloc(nid);
- setup_bootmem(nid);
- reserve_node_bootmem(nid);
-
-and broke it up into:
-
- for_each_node(nid)
- NODE_DATA(nid) = careful_alloc(nid);
- setup_bootmem(nid);
- for_each_node(nid)
- reserve_node_bootmem(nid);
-
-The issue comes in when the 'careful_alloc()' is called on a node with
-no memory. It falls back to using bootmem from a previously-initialized
-node. But, bootmem has not yet been reserved when Jon's patch is
-applied. It gives back bogus memory (0xc000000000000000) and pukes
-later in boot.
-
-The following patch collapses the loop back together. It also breaks
-the mark_reserved_regions_for_nid() code out into a function and adds
-some comments. I think a huge part of introducing this bug is because
-the for loop was too long and hard to read.
-
-The actual bug fix here is the:
-
-+ if (end_pfn <= node->node_start_pfn ||
-+ start_pfn >= node_end_pfn)
-+ continue;
-
-Signed-off-by: Olaf Hering <olh@suse.de>
-
----
- arch/powerpc/mm/numa.c | 130 ++++++++++++++++++++++++++++++-------------------
- 1 file changed, 82 insertions(+), 48 deletions(-)
-
---- a/arch/powerpc/mm/numa.c
-+++ b/arch/powerpc/mm/numa.c
-@@ -19,6 +19,7 @@
- #include <linux/notifier.h>
- #include <linux/lmb.h>
- #include <linux/of.h>
-+#include <linux/pfn.h>
- #include <asm/sparsemem.h>
- #include <asm/prom.h>
- #include <asm/system.h>
-@@ -867,10 +868,75 @@ static struct notifier_block __cpuinitda
- .priority = 1 /* Must run before sched domains notifier. */
- };
-
-+static void mark_reserved_regions_for_nid(int nid)
-+{
-+ struct pglist_data *node = NODE_DATA(nid);
-+ int i;
-+
-+ dbg("mark_reserved_regions_for_nid(%d) NODE_DATA: %p\n", nid, node);
-+ for (i = 0; i < lmb.reserved.cnt; i++) {
-+ unsigned long physbase = lmb.reserved.region[i].base;
-+ unsigned long size = lmb.reserved.region[i].size;
-+ unsigned long start_pfn = physbase >> PAGE_SHIFT;
-+ unsigned long end_pfn = PFN_UP(physbase + size);
-+ struct node_active_region node_ar;
-+ unsigned long node_end_pfn = node->node_start_pfn +
-+ node->node_spanned_pages;
-+
-+ /*
-+ * Check to make sure that this lmb.reserved area is
-+ * within the bounds of the node that we care about.
-+ * Checking the nid of the start and end points is not
-+ * sufficient because the reserved area could span the
-+ * entire node.
-+ */
-+ if (end_pfn <= node->node_start_pfn ||
-+ start_pfn >= node_end_pfn)
-+ continue;
-+
-+ get_node_active_region(start_pfn, &node_ar);
-+ while (start_pfn < end_pfn &&
-+ node_ar.start_pfn < node_ar.end_pfn) {
-+ unsigned long reserve_size = size;
-+ /*
-+ * if reserved region extends past active region
-+ * then trim size to active region
-+ */
-+ if (end_pfn > node_ar.end_pfn)
-+ reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
-+ - physbase;
-+ /*
-+ * Only worry about *this* node, others may not
-+ * yet have valid NODE_DATA().
-+ */
-+ if (node_ar.nid == nid)
-+ reserve_bootmem_node(NODE_DATA(node_ar.nid),
-+ physbase, reserve_size,
-+ BOOTMEM_DEFAULT);
-+ /*
-+ * if reserved region is contained in the active region
-+ * then done.
-+ */
-+ if (end_pfn <= node_ar.end_pfn)
-+ break;
-+
-+ /*
-+ * reserved region extends past the active region
-+ * get next active region that contains this
-+ * reserved region
-+ */
-+ start_pfn = node_ar.end_pfn;
-+ physbase = start_pfn << PAGE_SHIFT;
-+ size = size - reserve_size;
-+ get_node_active_region(start_pfn, &node_ar);
-+ }
-+ }
-+}
-+
-+
- void __init do_init_bootmem(void)
- {
- int nid;
-- unsigned int i;
-
- min_low_pfn = 0;
- max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
-@@ -890,9 +956,16 @@ void __init do_init_bootmem(void)
- unsigned long bootmem_paddr;
- unsigned long bootmap_pages;
-
-+ dbg("node %d is online\n", nid);
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
-
-- /* Allocate the node structure node local if possible */
-+ /*
-+ * Allocate the node structure node local if possible
-+ *
-+ * Be careful moving this around, as it relies on all
-+ * previous nodes' bootmem to be initialized and have
-+ * all reserved areas marked.
-+ */
- NODE_DATA(nid) = careful_allocation(nid,
- sizeof(struct pglist_data),
- SMP_CACHE_BYTES, end_pfn);
-@@ -924,53 +997,14 @@ void __init do_init_bootmem(void)
- start_pfn, end_pfn);
-
- free_bootmem_with_active_regions(nid, end_pfn);
-- }
--
-- /* Mark reserved regions */
-- for (i = 0; i < lmb.reserved.cnt; i++) {
-- unsigned long physbase = lmb.reserved.region[i].base;
-- unsigned long size = lmb.reserved.region[i].size;
-- unsigned long start_pfn = physbase >> PAGE_SHIFT;
-- unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
-- struct node_active_region node_ar;
--
-- get_node_active_region(start_pfn, &node_ar);
-- while (start_pfn < end_pfn &&
-- node_ar.start_pfn < node_ar.end_pfn) {
-- unsigned long reserve_size = size;
-- /*
-- * if reserved region extends past active region
-- * then trim size to active region
-- */
-- if (end_pfn > node_ar.end_pfn)
-- reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
-- - (start_pfn << PAGE_SHIFT);
-- dbg("reserve_bootmem %lx %lx nid=%d\n", physbase,
-- reserve_size, node_ar.nid);
-- reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
-- reserve_size, BOOTMEM_DEFAULT);
-- /*
-- * if reserved region is contained in the active region
-- * then done.
-- */
-- if (end_pfn <= node_ar.end_pfn)
-- break;
--
-- /*
-- * reserved region extends past the active region
-- * get next active region that contains this
-- * reserved region
-- */
-- start_pfn = node_ar.end_pfn;
-- physbase = start_pfn << PAGE_SHIFT;
-- size = size - reserve_size;
-- get_node_active_region(start_pfn, &node_ar);
-- }
--
-- }
--
-- for_each_online_node(nid)
-+ /*
-+ * Be very careful about moving this around. Future
-+ * calls to careful_allocation() depend on this getting
-+ * done correctly.
-+ */
-+ mark_reserved_regions_for_nid(nid);
- sparse_memory_present_with_active_regions(nid);
-+ }
- }
-
- void __init paging_init(void)