--- /dev/null
+From f85086f95fa36194eb0db5cd5c12e56801b98523 Mon Sep 17 00:00:00 2001
+From: Laurent Dufour <ldufour@linux.ibm.com>
+Date: Fri, 25 Sep 2020 21:19:31 -0700
+Subject: mm: don't rely on system state to detect hot-plug operations
+
+From: Laurent Dufour <ldufour@linux.ibm.com>
+
+commit f85086f95fa36194eb0db5cd5c12e56801b98523 upstream.
+
+In register_mem_sect_under_node() the system_state's value is checked to
+detect whether the call is made during boot time or during a hot-plug
+operation. Unfortunately, that check against SYSTEM_BOOTING is wrong
+because regular memory is registered at SYSTEM_SCHEDULING state. In
+addition, memory hot-plug operation can be triggered at this system
+state by the ACPI [1]. So checking against the system state is not
+enough.
+
+The consequence shows up on a system with interleaved node ranges like this:
+
+ Early memory node ranges
+ node 1: [mem 0x0000000000000000-0x000000011fffffff]
+ node 2: [mem 0x0000000120000000-0x000000014fffffff]
+ node 1: [mem 0x0000000150000000-0x00000001ffffffff]
+ node 0: [mem 0x0000000200000000-0x000000048fffffff]
+ node 2: [mem 0x0000000490000000-0x00000007ffffffff]
+
+This can be seen on PowerPC LPAR after multiple memory hot-plug and
+hot-unplug operations are done. At the next reboot the node's memory
+ranges can be interleaved and since the call to link_mem_sections() is
+made in topology_init() while the system is in the SYSTEM_SCHEDULING
+state, the node's id is not checked, and the sections are registered to
+multiple nodes:
+
+ $ ls -l /sys/devices/system/memory/memory21/node*
+ total 0
+ lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1
+ lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2
+
+In that case, the system is able to boot but if later one of these
+memory blocks is hot-unplugged and then hot-plugged, the sysfs
+inconsistency is detected and this triggers a BUG_ON():
+
+ kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084!
+ Oops: Exception in kernel mode, sig: 5 [#1]
+ LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
+ Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4
+ CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25
+ Call Trace:
+ add_memory_resource+0x23c/0x340 (unreliable)
+ __add_memory+0x5c/0xf0
+ dlpar_add_lmb+0x1b4/0x500
+ dlpar_memory+0x1f8/0xb80
+ handle_dlpar_errorlog+0xc0/0x190
+ dlpar_store+0x198/0x4a0
+ kobj_attr_store+0x30/0x50
+ sysfs_kf_write+0x64/0x90
+ kernfs_fop_write+0x1b0/0x290
+ vfs_write+0xe8/0x290
+ ksys_write+0xdc/0x130
+ system_call_exception+0x160/0x270
+ system_call_common+0xf0/0x27c
+
+This patch addresses the root cause by not relying on the system_state
+value to detect whether the call is due to a hot-plug operation. An
+extra parameter is added to link_mem_sections() detailing whether the
+operation is due to a hot-plug operation.
+
+[1] According to Oscar Salvador, using this qemu command line, ACPI
+memory hotplug operations are raised at SYSTEM_SCHEDULING state:
+
+ $QEMU -enable-kvm -machine pc -smp 4,sockets=4,cores=1,threads=1 -cpu host -monitor pty \
+ -m size=$MEM,slots=255,maxmem=4294967296k \
+ -numa node,nodeid=0,cpus=0-3,mem=512 -numa node,nodeid=1,mem=512 \
+ -object memory-backend-ram,id=memdimm0,size=134217728 -device pc-dimm,node=0,memdev=memdimm0,id=dimm0,slot=0 \
+ -object memory-backend-ram,id=memdimm1,size=134217728 -device pc-dimm,node=0,memdev=memdimm1,id=dimm1,slot=1 \
+ -object memory-backend-ram,id=memdimm2,size=134217728 -device pc-dimm,node=0,memdev=memdimm2,id=dimm2,slot=2 \
+ -object memory-backend-ram,id=memdimm3,size=134217728 -device pc-dimm,node=0,memdev=memdimm3,id=dimm3,slot=3 \
+ -object memory-backend-ram,id=memdimm4,size=134217728 -device pc-dimm,node=1,memdev=memdimm4,id=dimm4,slot=4 \
+ -object memory-backend-ram,id=memdimm5,size=134217728 -device pc-dimm,node=1,memdev=memdimm5,id=dimm5,slot=5 \
+ -object memory-backend-ram,id=memdimm6,size=134217728 -device pc-dimm,node=1,memdev=memdimm6,id=dimm6,slot=6 \
+
+Fixes: 4fbce633910e ("mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range()")
+Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: "Rafael J. Wysocki" <rafael@kernel.org>
+Cc: Fenghua Yu <fenghua.yu@intel.com>
+Cc: Nathan Lynch <nathanl@linux.ibm.com>
+Cc: Scott Cheloha <cheloha@linux.ibm.com>
+Cc: Tony Luck <tony.luck@intel.com>
+Cc: <stable@vger.kernel.org>
+Link: https://lkml.kernel.org/r/20200915094143.79181-3-ldufour@linux.ibm.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/base/node.c | 84 ++++++++++++++++++++++++++++++++-------------------
+ include/linux/node.h | 11 ++++--
+ mm/memory_hotplug.c | 3 +
+ 3 files changed, 63 insertions(+), 35 deletions(-)
+
+--- a/drivers/base/node.c
++++ b/drivers/base/node.c
+@@ -403,10 +403,32 @@ static int __ref get_nid_for_pfn(unsigne
+ return pfn_to_nid(pfn);
+ }
+
++static int do_register_memory_block_under_node(int nid,
++ struct memory_block *mem_blk)
++{
++ int ret;
++
++ /*
++ * If this memory block spans multiple nodes, we only indicate
++ * the last processed node.
++ */
++ mem_blk->nid = nid;
++
++ ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
++ &mem_blk->dev.kobj,
++ kobject_name(&mem_blk->dev.kobj));
++ if (ret)
++ return ret;
++
++ return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
++ &node_devices[nid]->dev.kobj,
++ kobject_name(&node_devices[nid]->dev.kobj));
++}
++
+ /* register memory section under specified node if it spans that node */
+-int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg)
++int register_mem_block_under_node_early(struct memory_block *mem_blk, void *arg)
+ {
+- int ret, nid = *(int *)arg;
++ int nid = *(int *)arg;
+ unsigned long pfn, sect_start_pfn, sect_end_pfn;
+
+ sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
+@@ -426,39 +448,34 @@ int register_mem_sect_under_node(struct
+ }
+
+ /*
+- * We need to check if page belongs to nid only for the boot
+- * case, during hotplug we know that all pages in the memory
+- * block belong to the same node.
+- */
+- if (system_state == SYSTEM_BOOTING) {
+- page_nid = get_nid_for_pfn(pfn);
+- if (page_nid < 0)
+- continue;
+- if (page_nid != nid)
+- continue;
+- }
+-
+- /*
+- * If this memory block spans multiple nodes, we only indicate
+- * the last processed node.
++ * We need to check if page belongs to nid only at the boot
++ * case because node's ranges can be interleaved.
+ */
+- mem_blk->nid = nid;
+-
+- ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
+- &mem_blk->dev.kobj,
+- kobject_name(&mem_blk->dev.kobj));
+- if (ret)
+- return ret;
++ page_nid = get_nid_for_pfn(pfn);
++ if (page_nid < 0)
++ continue;
++ if (page_nid != nid)
++ continue;
+
+- return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
+- &node_devices[nid]->dev.kobj,
+- kobject_name(&node_devices[nid]->dev.kobj));
++ return do_register_memory_block_under_node(nid, mem_blk);
+ }
+ /* mem section does not span the specified node */
+ return 0;
+ }
+
+ /*
++ * During hotplug we know that all pages in the memory block belong to the same
++ * node.
++ */
++static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
++ void *arg)
++{
++ int nid = *(int *)arg;
++
++ return do_register_memory_block_under_node(nid, mem_blk);
++}
++
++/*
+ * Unregister a memory block device under the node it spans. Memory blocks
+ * with multiple nodes cannot be offlined and therefore also never be removed.
+ */
+@@ -473,10 +490,17 @@ void unregister_memory_block_under_nodes
+ kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
+ }
+
+-int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
++int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
++ enum meminit_context context)
+ {
+- return walk_memory_range(start_pfn, end_pfn, (void *)&nid,
+- register_mem_sect_under_node);
++ walk_memory_blocks_func_t func;
++
++ if (context == MEMINIT_HOTPLUG)
++ func = register_mem_block_under_node_hotplug;
++ else
++ func = register_mem_block_under_node_early;
++
++ return walk_memory_range(start_pfn, end_pfn, (void *)&nid, func);
+ }
+
+ #ifdef CONFIG_HUGETLBFS
+--- a/include/linux/node.h
++++ b/include/linux/node.h
+@@ -32,11 +32,13 @@ extern struct node *node_devices[];
+ typedef void (*node_registration_func_t)(struct node *);
+
+ #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA)
+-extern int link_mem_sections(int nid, unsigned long start_pfn,
+- unsigned long end_pfn);
++int link_mem_sections(int nid, unsigned long start_pfn,
++ unsigned long end_pfn,
++ enum meminit_context context);
+ #else
+ static inline int link_mem_sections(int nid, unsigned long start_pfn,
+- unsigned long end_pfn)
++ unsigned long end_pfn,
++ enum meminit_context context)
+ {
+ return 0;
+ }
+@@ -61,7 +63,8 @@ static inline int register_one_node(int
+ if (error)
+ return error;
+ /* link memory sections under this node */
+- error = link_mem_sections(nid, start_pfn, end_pfn);
++ error = link_mem_sections(nid, start_pfn, end_pfn,
++ MEMINIT_EARLY);
+ }
+
+ return error;
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -1102,7 +1102,8 @@ int __ref add_memory_resource(int nid, s
+ }
+
+ /* link memory sections under this node.*/
+- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
++ ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
++ MEMINIT_HOTPLUG);
+ BUG_ON(ret);
+
+ /* create new memmap entry */
--- /dev/null
+From c1d0da83358a2316d9be7f229f26126dbaa07468 Mon Sep 17 00:00:00 2001
+From: Laurent Dufour <ldufour@linux.ibm.com>
+Date: Fri, 25 Sep 2020 21:19:28 -0700
+Subject: mm: replace memmap_context by meminit_context
+
+From: Laurent Dufour <ldufour@linux.ibm.com>
+
+commit c1d0da83358a2316d9be7f229f26126dbaa07468 upstream.
+
+Patch series "mm: fix memory to node bad links in sysfs", v3.
+
+Sometimes, firmware may expose interleaved memory layout like this:
+
+ Early memory node ranges
+ node 1: [mem 0x0000000000000000-0x000000011fffffff]
+ node 2: [mem 0x0000000120000000-0x000000014fffffff]
+ node 1: [mem 0x0000000150000000-0x00000001ffffffff]
+ node 0: [mem 0x0000000200000000-0x000000048fffffff]
+ node 2: [mem 0x0000000490000000-0x00000007ffffffff]
+
+In that case, we can see memory blocks assigned to multiple nodes in
+sysfs:
+
+ $ ls -l /sys/devices/system/memory/memory21
+ total 0
+ lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1
+ lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2
+ -rw-r--r-- 1 root root 65536 Aug 24 05:27 online
+ -r--r--r-- 1 root root 65536 Aug 24 05:27 phys_device
+ -r--r--r-- 1 root root 65536 Aug 24 05:27 phys_index
+ drwxr-xr-x 2 root root 0 Aug 24 05:27 power
+ -r--r--r-- 1 root root 65536 Aug 24 05:27 removable
+ -rw-r--r-- 1 root root 65536 Aug 24 05:27 state
+ lrwxrwxrwx 1 root root 0 Aug 24 05:25 subsystem -> ../../../../bus/memory
+ -rw-r--r-- 1 root root 65536 Aug 24 05:25 uevent
+ -r--r--r-- 1 root root 65536 Aug 24 05:27 valid_zones
+
+The same applies in the node's directory with a memory21 link in both
+the node1 and node2's directory.
+
+This is wrong but doesn't prevent the system from running. However, when
+one of these memory blocks is later hot-unplugged and then hot-plugged,
+the system detects an inconsistency in the sysfs layout and a
+BUG_ON() is raised:
+
+ kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084!
+ LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
+ Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4
+ CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25
+ Call Trace:
+ add_memory_resource+0x23c/0x340 (unreliable)
+ __add_memory+0x5c/0xf0
+ dlpar_add_lmb+0x1b4/0x500
+ dlpar_memory+0x1f8/0xb80
+ handle_dlpar_errorlog+0xc0/0x190
+ dlpar_store+0x198/0x4a0
+ kobj_attr_store+0x30/0x50
+ sysfs_kf_write+0x64/0x90
+ kernfs_fop_write+0x1b0/0x290
+ vfs_write+0xe8/0x290
+ ksys_write+0xdc/0x130
+ system_call_exception+0x160/0x270
+ system_call_common+0xf0/0x27c
+
+This has been seen on PowerPC LPAR.
+
+The root cause of this issue is that when node's memory is registered,
+the range used can overlap another node's range, thus the memory block
+is registered to multiple nodes in sysfs.
+
+There are two issues here:
+
+ (a) The sysfs memory and node's layouts are broken due to these
+ multiple links
+
+ (b) The link errors in link_mem_sections() should not lead to a system
+ panic.
+
+To address (a) register_mem_sect_under_node() should not rely on the
+system state to detect whether the link operation is triggered by a
+hot-plug operation or not. This is addressed by patches 1 and 2 of this
+series.
+
+Issue (b) will be addressed separately.
+
+This patch (of 2):
+
+The memmap_context enum is used to detect whether a memory operation is
+due to a hot-add operation or happening at boot time.
+
+Make it general to the hotplug operation and rename it as
+meminit_context.
+
+There is no functional change introduced by this patch.
+
+Suggested-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Cc: "Rafael J . Wysocki" <rafael@kernel.org>
+Cc: Nathan Lynch <nathanl@linux.ibm.com>
+Cc: Scott Cheloha <cheloha@linux.ibm.com>
+Cc: Tony Luck <tony.luck@intel.com>
+Cc: Fenghua Yu <fenghua.yu@intel.com>
+Cc: <stable@vger.kernel.org>
+Link: https://lkml.kernel.org/r/20200915094143.79181-1-ldufour@linux.ibm.com
+Link: https://lkml.kernel.org/r/20200915132624.9723-1-ldufour@linux.ibm.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/ia64/mm/init.c | 6 +++---
+ include/linux/mm.h | 2 +-
+ include/linux/mmzone.h | 11 ++++++++---
+ mm/memory_hotplug.c | 2 +-
+ mm/page_alloc.c | 11 ++++++-----
+ 5 files changed, 19 insertions(+), 13 deletions(-)
+
+--- a/arch/ia64/mm/init.c
++++ b/arch/ia64/mm/init.c
+@@ -499,7 +499,7 @@ virtual_memmap_init(u64 start, u64 end,
+ if (map_start < map_end)
+ memmap_init_zone((unsigned long)(map_end - map_start),
+ args->nid, args->zone, page_to_pfn(map_start),
+- MEMMAP_EARLY, NULL);
++ MEMINIT_EARLY, NULL);
+ return 0;
+ }
+
+@@ -508,8 +508,8 @@ memmap_init (unsigned long size, int nid
+ unsigned long start_pfn)
+ {
+ if (!vmem_map) {
+- memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY,
+- NULL);
++ memmap_init_zone(size, nid, zone, start_pfn,
++ MEMINIT_EARLY, NULL);
+ } else {
+ struct page *start;
+ struct memmap_init_callback_data args;
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2179,7 +2179,7 @@ static inline void zero_resv_unavail(voi
+
+ extern void set_dma_reserve(unsigned long new_dma_reserve);
+ extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long,
+- enum memmap_context, struct vmem_altmap *);
++ enum meminit_context, struct vmem_altmap *);
+ extern void setup_per_zone_wmarks(void);
+ extern int __meminit init_per_zone_wmark_min(void);
+ extern void mem_init(void);
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -759,10 +759,15 @@ bool zone_watermark_ok(struct zone *z, u
+ unsigned int alloc_flags);
+ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+ unsigned long mark, int classzone_idx);
+-enum memmap_context {
+- MEMMAP_EARLY,
+- MEMMAP_HOTPLUG,
++/*
++ * Memory initialization context, used to differentiate memory added by
++ * the platform statically or via memory hotplug interface.
++ */
++enum meminit_context {
++ MEMINIT_EARLY,
++ MEMINIT_HOTPLUG,
+ };
++
+ extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long size);
+
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -733,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct
+ * are reserved so nobody should be touching them so we should be safe
+ */
+ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
+- MEMMAP_HOTPLUG, altmap);
++ MEMINIT_HOTPLUG, altmap);
+
+ set_zone_contiguous(zone);
+ }
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -5480,7 +5480,7 @@ void __ref build_all_zonelists(pg_data_t
+ * done. Non-atomic initialization, single-pass.
+ */
+ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+- unsigned long start_pfn, enum memmap_context context,
++ unsigned long start_pfn, enum meminit_context context,
+ struct vmem_altmap *altmap)
+ {
+ unsigned long end_pfn = start_pfn + size;
+@@ -5507,7 +5507,7 @@ void __meminit memmap_init_zone(unsigned
+ * There can be holes in boot-time mem_map[]s handed to this
+ * function. They do not exist on hotplugged memory.
+ */
+- if (context != MEMMAP_EARLY)
++ if (context != MEMINIT_EARLY)
+ goto not_early;
+
+ if (!early_pfn_valid(pfn))
+@@ -5542,7 +5542,7 @@ void __meminit memmap_init_zone(unsigned
+ not_early:
+ page = pfn_to_page(pfn);
+ __init_single_page(page, pfn, zone, nid);
+- if (context == MEMMAP_HOTPLUG)
++ if (context == MEMINIT_HOTPLUG)
+ SetPageReserved(page);
+
+ /*
+@@ -5557,7 +5557,7 @@ not_early:
+ * check here not to call set_pageblock_migratetype() against
+ * pfn out of zone.
+ *
+- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
++ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
+ * because this is done early in sparse_add_one_section
+ */
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+@@ -5578,7 +5578,8 @@ static void __meminit zone_init_free_lis
+
+ #ifndef __HAVE_ARCH_MEMMAP_INIT
+ #define memmap_init(size, nid, zone, start_pfn) \
+- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
++ memmap_init_zone((size), (nid), (zone), (start_pfn), \
++ MEMINIT_EARLY, NULL)
+ #endif
+
+ static int zone_batchsize(struct zone *zone)