From 217b136493fd2a22f8a93ea034e95aad8e9ad788 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 17 Apr 2023 21:24:52 -0400 Subject: [PATCH] Fixes for 5.10 Signed-off-by: Sasha Levin --- ...ate-stop-condition-after-timeout-in-.patch | 106 ++++ ...wl-fix-a-couple-of-kernel-doc-issues.patch | 51 ++ ...-update-the-numa-distance-table-for-.patch | 84 ++++ ...-add-a-helper-for-form1-cpu-distance.patch | 101 ++++ ...-add-support-for-form2-associativity.patch | 471 ++++++++++++++++++ ...consolidate-different-numa-distance-.patch | 392 +++++++++++++++ ...rename-min_common_depth-to-primary_d.patch | 158 ++++++ ...rename-type1_affinity-to-form1_affin.patch | 170 +++++++ .../sched-fair-fix-imbalance-overflow.patch | 48 ++ ...calculate-of-avg_load-to-a-better-lo.patch | 50 ++ queue-5.10/series | 13 + ...-caused-by-recursively-holding-work_.patch | 66 +++ ...attaching-when-vid_hdr-offset-equals.patch | 79 +++ ...-remove-__init-for-runtime-functions.patch | 53 ++ 14 files changed, 1842 insertions(+) create mode 100644 queue-5.10/i2c-ocores-generate-stop-condition-after-timeout-in-.patch create mode 100644 queue-5.10/mtd-ubi-wl-fix-a-couple-of-kernel-doc-issues.patch create mode 100644 queue-5.10/powerpc-papr_scm-update-the-numa-distance-table-for-.patch create mode 100644 queue-5.10/powerpc-pseries-add-a-helper-for-form1-cpu-distance.patch create mode 100644 queue-5.10/powerpc-pseries-add-support-for-form2-associativity.patch create mode 100644 queue-5.10/powerpc-pseries-consolidate-different-numa-distance-.patch create mode 100644 queue-5.10/powerpc-pseries-rename-min_common_depth-to-primary_d.patch create mode 100644 queue-5.10/powerpc-pseries-rename-type1_affinity-to-form1_affin.patch create mode 100644 queue-5.10/sched-fair-fix-imbalance-overflow.patch create mode 100644 queue-5.10/sched-fair-move-calculate-of-avg_load-to-a-better-lo.patch create mode 100644 queue-5.10/ubi-fix-deadlock-caused-by-recursively-holding-work_.patch create mode 100644 
queue-5.10/ubi-fix-failure-attaching-when-vid_hdr-offset-equals.patch create mode 100644 queue-5.10/x86-rtc-remove-__init-for-runtime-functions.patch diff --git a/queue-5.10/i2c-ocores-generate-stop-condition-after-timeout-in-.patch b/queue-5.10/i2c-ocores-generate-stop-condition-after-timeout-in-.patch new file mode 100644 index 00000000000..3deafdbb682 --- /dev/null +++ b/queue-5.10/i2c-ocores-generate-stop-condition-after-timeout-in-.patch @@ -0,0 +1,106 @@ +From fcae622444dca8b418e9de3b0ac59e78e9435025 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 13 Apr 2023 11:37:37 +0200 +Subject: i2c: ocores: generate stop condition after timeout in polling mode + +From: Gregor Herburger + +[ Upstream commit f8160d3b35fc94491bb0cb974dbda310ef96c0e2 ] + +In polling mode, no stop condition is generated after a timeout. This +causes SCL to remain low and thereby block the bus. If this happens +during a transfer it can cause slaves to misinterpret the subsequent +transfer and return wrong values. + +To solve this, pass the ETIMEDOUT error up from ocores_process_polling() +instead of setting STATE_ERROR directly. The caller is adjusted to call +ocores_process_timeout() on error both in polling and in IRQ mode, which +will set STATE_ERROR and generate a stop condition. 
+ +Fixes: 69c8c0c0efa8 ("i2c: ocores: add polling interface") +Signed-off-by: Gregor Herburger +Signed-off-by: Matthias Schiffer +Acked-by: Peter Korsgaard +Reviewed-by: Andrew Lunn +Reviewed-by: Federico Vaga +Signed-off-by: Wolfram Sang +Signed-off-by: Sasha Levin +--- + drivers/i2c/busses/i2c-ocores.c | 35 ++++++++++++++++++--------------- + 1 file changed, 19 insertions(+), 16 deletions(-) + +diff --git a/drivers/i2c/busses/i2c-ocores.c b/drivers/i2c/busses/i2c-ocores.c +index f5fc75b65a194..71e26aa6bd8ff 100644 +--- a/drivers/i2c/busses/i2c-ocores.c ++++ b/drivers/i2c/busses/i2c-ocores.c +@@ -343,18 +343,18 @@ static int ocores_poll_wait(struct ocores_i2c *i2c) + * ocores_isr(), we just add our polling code around it. + * + * It can run in atomic context ++ * ++ * Return: 0 on success, -ETIMEDOUT on timeout + */ +-static void ocores_process_polling(struct ocores_i2c *i2c) ++static int ocores_process_polling(struct ocores_i2c *i2c) + { +- while (1) { +- irqreturn_t ret; +- int err; ++ irqreturn_t ret; ++ int err = 0; + ++ while (1) { + err = ocores_poll_wait(i2c); +- if (err) { +- i2c->state = STATE_ERROR; ++ if (err) + break; /* timeout */ +- } + + ret = ocores_isr(-1, i2c); + if (ret == IRQ_NONE) +@@ -365,13 +365,15 @@ static void ocores_process_polling(struct ocores_i2c *i2c) + break; + } + } ++ ++ return err; + } + + static int ocores_xfer_core(struct ocores_i2c *i2c, + struct i2c_msg *msgs, int num, + bool polling) + { +- int ret; ++ int ret = 0; + u8 ctrl; + + ctrl = oc_getreg(i2c, OCI2C_CONTROL); +@@ -389,15 +391,16 @@ static int ocores_xfer_core(struct ocores_i2c *i2c, + oc_setreg(i2c, OCI2C_CMD, OCI2C_CMD_START); + + if (polling) { +- ocores_process_polling(i2c); ++ ret = ocores_process_polling(i2c); + } else { +- ret = wait_event_timeout(i2c->wait, +- (i2c->state == STATE_ERROR) || +- (i2c->state == STATE_DONE), HZ); +- if (ret == 0) { +- ocores_process_timeout(i2c); +- return -ETIMEDOUT; +- } ++ if (wait_event_timeout(i2c->wait, ++ (i2c->state == 
STATE_ERROR) || ++ (i2c->state == STATE_DONE), HZ) == 0) ++ ret = -ETIMEDOUT; ++ } ++ if (ret) { ++ ocores_process_timeout(i2c); ++ return ret; + } + + return (i2c->state == STATE_DONE) ? num : -EIO; +-- +2.39.2 + diff --git a/queue-5.10/mtd-ubi-wl-fix-a-couple-of-kernel-doc-issues.patch b/queue-5.10/mtd-ubi-wl-fix-a-couple-of-kernel-doc-issues.patch new file mode 100644 index 00000000000..8b3f28d5514 --- /dev/null +++ b/queue-5.10/mtd-ubi-wl-fix-a-couple-of-kernel-doc-issues.patch @@ -0,0 +1,51 @@ +From 10c187783fcd90715eb08de38c99757762291e08 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 9 Nov 2020 18:21:55 +0000 +Subject: mtd: ubi: wl: Fix a couple of kernel-doc issues + +From: Lee Jones + +[ Upstream commit ab4e4de9fd8b469823a645f05f2c142e9270b012 ] + +Fixes the following W=1 kernel build warning(s): + + drivers/mtd/ubi/wl.c:584: warning: Function parameter or member 'nested' not described in 'schedule_erase' + drivers/mtd/ubi/wl.c:1075: warning: Excess function parameter 'shutdown' description in '__erase_worker' + +Cc: Richard Weinberger +Cc: Miquel Raynal +Cc: Vignesh Raghavendra +Cc: linux-mtd@lists.infradead.org +Signed-off-by: Lee Jones +Signed-off-by: Miquel Raynal +Link: https://lore.kernel.org/linux-mtd/20201109182206.3037326-13-lee.jones@linaro.org +Stable-dep-of: f773f0a331d6 ("ubi: Fix deadlock caused by recursively holding work_sem") +Signed-off-by: Sasha Levin +--- + drivers/mtd/ubi/wl.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c +index 6da09263e0b9f..2ee0e60c43c2e 100644 +--- a/drivers/mtd/ubi/wl.c ++++ b/drivers/mtd/ubi/wl.c +@@ -575,6 +575,7 @@ static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, + * @vol_id: the volume ID that last used this PEB + * @lnum: the last used logical eraseblock number for the PEB + * @torture: if the physical eraseblock has to be tortured ++ * @nested: denotes whether the work_sem is already held in read mode + * 
+ * This function returns zero in case of success and a %-ENOMEM in case of + * failure. +@@ -1066,8 +1067,6 @@ static int ensure_wear_leveling(struct ubi_device *ubi, int nested) + * __erase_worker - physical eraseblock erase worker function. + * @ubi: UBI device description object + * @wl_wrk: the work object +- * @shutdown: non-zero if the worker has to free memory and exit +- * because the WL sub-system is shutting down + * + * This function erases a physical eraseblock and perform torture testing if + * needed. It also takes care about marking the physical eraseblock bad if +-- +2.39.2 + diff --git a/queue-5.10/powerpc-papr_scm-update-the-numa-distance-table-for-.patch b/queue-5.10/powerpc-papr_scm-update-the-numa-distance-table-for-.patch new file mode 100644 index 00000000000..267ef2d3e62 --- /dev/null +++ b/queue-5.10/powerpc-papr_scm-update-the-numa-distance-table-for-.patch @@ -0,0 +1,84 @@ +From 9143366ddb48aa1f932bfeec6ebfdaa968831ee4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 4 Apr 2023 09:44:33 +0530 +Subject: powerpc/papr_scm: Update the NUMA distance table for the target node + +From: Aneesh Kumar K.V + +[ Upstream commit b277fc793daf258877b4c0744b52f69d6e6ba22e ] + +Platform device helper routines won't update the NUMA distance table +while creating a platform device, even if the device is present on a +NUMA node that doesn't have memory or CPU. This is especially true for +pmem devices. If the target node of the pmem device is not online, we +find the nearest online node to the device and associate the pmem device +with that online node. To find the nearest online node, we should have +the numa distance table updated correctly. Update the distance +information during the device probe. 
+ +For a papr scm device on NUMA node 3 distance_lookup_table value for +distance_ref_points_depth = 2 before and after fix is below: + +Before fix: + node 3 distance depth 0 - 0 + node 3 distance depth 1 - 0 + node 4 distance depth 0 - 4 + node 4 distance depth 1 - 2 + node 5 distance depth 0 - 5 + node 5 distance depth 1 - 1 + +After fix + node 3 distance depth 0 - 3 + node 3 distance depth 1 - 1 + node 4 distance depth 0 - 4 + node 4 distance depth 1 - 2 + node 5 distance depth 0 - 5 + node 5 distance depth 1 - 1 + +Without the fix, the nearest numa node to the pmem device (NUMA node 3) +will be picked as 4. After the fix, we get the correct numa node which +is 5. + +Fixes: da1115fdbd6e ("powerpc/nvdimm: Pick nearby online node if the device node is not online") +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Michael Ellerman +Link: https://msgid.link/20230404041433.1781804-1-aneesh.kumar@linux.ibm.com +Signed-off-by: Sasha Levin +--- + arch/powerpc/mm/numa.c | 1 + + arch/powerpc/platforms/pseries/papr_scm.c | 7 +++++++ + 2 files changed, 8 insertions(+) + +diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c +index cfc170935a58b..ce8569e16f0c4 100644 +--- a/arch/powerpc/mm/numa.c ++++ b/arch/powerpc/mm/numa.c +@@ -372,6 +372,7 @@ void update_numa_distance(struct device_node *node) + WARN(numa_distance_table[nid][nid] == -1, + "NUMA distance details for node %d not provided\n", nid); + } ++EXPORT_SYMBOL_GPL(update_numa_distance); + + /* + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} +diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c +index 057acbb9116dd..e3b7698b4762c 100644 +--- a/arch/powerpc/platforms/pseries/papr_scm.c ++++ b/arch/powerpc/platforms/pseries/papr_scm.c +@@ -1079,6 +1079,13 @@ static int papr_scm_probe(struct platform_device *pdev) + return -ENODEV; + } + ++ /* ++ * open firmware platform device create won't update the NUMA ++ * distance table. 
For PAPR SCM devices we use numa_map_to_online_node() ++ * to find the nearest online NUMA node and that requires correct ++ * distance table information. ++ */ ++ update_numa_distance(dn); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) +-- +2.39.2 + diff --git a/queue-5.10/powerpc-pseries-add-a-helper-for-form1-cpu-distance.patch b/queue-5.10/powerpc-pseries-add-a-helper-for-form1-cpu-distance.patch new file mode 100644 index 00000000000..d11bdc9a1b3 --- /dev/null +++ b/queue-5.10/powerpc-pseries-add-a-helper-for-form1-cpu-distance.patch @@ -0,0 +1,101 @@ +From 6e4a8a54e0e81098fadb33328237141739504aa6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Aug 2021 18:52:22 +0530 +Subject: powerpc/pseries: Add a helper for form1 cpu distance + +From: Aneesh Kumar K.V + +[ Upstream commit ef31cb83d19c4c589d650747cd5a7e502be9f665 ] + +This helper is only used with the dispatch trace log collection. +A later patch will add Form2 affinity support and this change helps +in keeping that simpler. 
Also add a comment explaining we don't expect +the code to be called with FORM0 + +Signed-off-by: Aneesh Kumar K.V +Reviewed-by: David Gibson +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210812132223.225214-5-aneesh.kumar@linux.ibm.com +Stable-dep-of: b277fc793daf ("powerpc/papr_scm: Update the NUMA distance table for the target node") +Signed-off-by: Sasha Levin +--- + arch/powerpc/include/asm/topology.h | 4 ++-- + arch/powerpc/mm/numa.c | 10 +++++++++- + arch/powerpc/platforms/pseries/lpar.c | 4 ++-- + 3 files changed, 13 insertions(+), 5 deletions(-) + +diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h +index 1604920d8d2de..b239ef589ae06 100644 +--- a/arch/powerpc/include/asm/topology.h ++++ b/arch/powerpc/include/asm/topology.h +@@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) + cpu_all_mask : \ + cpumask_of_node(pcibus_to_node(bus))) + +-extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); ++int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); + extern int __node_distance(int, int); + #define node_distance(a, b) __node_distance(a, b) + +@@ -84,7 +84,7 @@ static inline void sysfs_remove_device_from_node(struct device *dev, + + static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {} + +-static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) ++static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) + { + return 0; + } +diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c +index e61593ae25c9e..010476abec344 100644 +--- a/arch/powerpc/mm/numa.c ++++ b/arch/powerpc/mm/numa.c +@@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu) + } + #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ + +-int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) ++static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) + { + int dist = 0; + +@@ -182,6 +182,14 @@ 
int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) + return dist; + } + ++int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) ++{ ++ /* We should not get called with FORM0 */ ++ VM_WARN_ON(affinity_form == FORM0_AFFINITY); ++ ++ return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); ++} ++ + /* must hold reference to node during call */ + static const __be32 *of_get_associativity(struct device_node *dev) + { +diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c +index 115d196560b8b..28396a7e77d6f 100644 +--- a/arch/powerpc/platforms/pseries/lpar.c ++++ b/arch/powerpc/platforms/pseries/lpar.c +@@ -261,7 +261,7 @@ static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu) + if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc) + return -EIO; + +- return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); ++ return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); + } + + static int cpu_home_node_dispatch_distance(int disp_cpu) +@@ -281,7 +281,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu) + if (!disp_cpu_assoc || !vcpu_assoc) + return -EIO; + +- return cpu_distance(disp_cpu_assoc, vcpu_assoc); ++ return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc); + } + + static void update_vcpu_disp_stat(int disp_cpu) +-- +2.39.2 + diff --git a/queue-5.10/powerpc-pseries-add-support-for-form2-associativity.patch b/queue-5.10/powerpc-pseries-add-support-for-form2-associativity.patch new file mode 100644 index 00000000000..a7a0c5ad00b --- /dev/null +++ b/queue-5.10/powerpc-pseries-add-support-for-form2-associativity.patch @@ -0,0 +1,471 @@ +From f242c177c86e9ce6dfea8000bcf601494f3fdeec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Aug 2021 18:52:23 +0530 +Subject: powerpc/pseries: Add support for FORM2 associativity + +From: Aneesh Kumar K.V + +[ Upstream commit 1c6b5a7e74052768977855f95d6b8812f6e7772c ] + +PAPR interface currently supports two different 
ways of communicating resource +grouping details to the OS. These are referred to as Form 0 and Form 1 +associativity grouping. Form 0 is the older format and is now considered +deprecated. This patch adds another resource grouping named FORM2. + +Signed-off-by: Daniel Henrique Barboza +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210812132223.225214-6-aneesh.kumar@linux.ibm.com +Stable-dep-of: b277fc793daf ("powerpc/papr_scm: Update the NUMA distance table for the target node") +Signed-off-by: Sasha Levin +--- + Documentation/powerpc/associativity.rst | 104 ++++++++++++ + arch/powerpc/include/asm/firmware.h | 3 +- + arch/powerpc/include/asm/prom.h | 1 + + arch/powerpc/kernel/prom_init.c | 3 +- + arch/powerpc/mm/numa.c | 187 ++++++++++++++++++---- + arch/powerpc/platforms/pseries/firmware.c | 1 + + 6 files changed, 262 insertions(+), 37 deletions(-) + create mode 100644 Documentation/powerpc/associativity.rst + +diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst +new file mode 100644 +index 0000000000000..07e7dd3d6c87e +--- /dev/null ++++ b/Documentation/powerpc/associativity.rst +@@ -0,0 +1,104 @@ ++============================ ++NUMA resource associativity ++============================ ++ ++Associativity represents the groupings of the various platform resources into ++domains of substantially similar mean performance relative to resources outside ++of that domain. Resource subsets of a given domain that exhibit better ++performance relative to each other than relative to other resource subsets ++are represented as being members of a sub-grouping domain. This performance ++characteristic is presented in terms of NUMA node distance within the Linux kernel. ++From the platform view, these groups are also referred to as domains. ++ ++PAPR interface currently supports different ways of communicating these resource ++grouping details to the OS. 
These are referred to as Form 0, Form 1 and Form 2 ++associativity grouping. Form 0 is the oldest format and is now considered deprecated. ++ ++Hypervisor indicates the type/form of associativity used via "ibm,architecture-vec-5" property. ++Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1. ++A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity ++bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used. ++ ++Form 0 ++------ ++Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE). ++ ++Form 1 ++------ ++With Form 1 a combination of ibm,associativity-reference-points, and ibm,associativity ++device tree properties are used to determine the NUMA distance between resource groups/domains. ++ ++The “ibm,associativity” property contains a list of one or more numbers (domainID) ++representing the resource’s platform grouping domains. ++ ++The “ibm,associativity-reference-points” property contains a list of one or more numbers ++(domainID index) that represents the 1 based ordinal in the associativity lists. ++The list of domainID indexes represents an increasing hierarchy of resource grouping. ++ ++ex: ++{ primary domainID index, secondary domainID index, tertiary domainID index.. } ++ ++Linux kernel uses the domainID at the primary domainID index as the NUMA node id. ++Linux kernel computes NUMA distance between two domains by recursively comparing ++if they belong to the same higher-level domains. For mismatch at every higher ++level of the resource group, the kernel doubles the NUMA distance between the ++comparing domains. ++ ++Form 2 ++------- ++Form 2 associativity format adds separate device tree properties representing NUMA node distance ++thereby making the node distance computation flexible. Form 2 also allows flexible primary ++domain numbering. 
With numa distance computation now detached from the index value in ++"ibm,associativity-reference-points" property, Form 2 allows a large number of primary domain ++ids at the same domainID index representing resource groups of different performance/latency ++characteristics. ++ ++Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the ++"ibm,architecture-vec-5" property. ++ ++"ibm,numa-lookup-index-table" property contains a list of one or more numbers representing ++the domainIDs present in the system. The offset of the domainID in this property is ++used as an index while computing numa distance information via "ibm,numa-distance-table". ++ ++prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by ++N domainID encoded as with encode-int ++ ++For ex: ++"ibm,numa-lookup-index-table" = {4, 0, 8, 250, 252}. The offset of domainID 8 (2) is used when ++computing the distance of domain 8 from other domains present in the system. For the rest of ++this document, this offset will be referred to as domain distance offset. ++ ++"ibm,numa-distance-table" property contains a list of one or more numbers representing the NUMA ++distance between resource groups/domains present in the system. ++ ++prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by ++N distance values encoded as with encode-bytes. The max distance value we could encode is 255. ++The number N must be equal to the square of m where m is the number of domainIDs in the ++numa-lookup-index-table. 
++ ++For ex: ++ibm,numa-lookup-index-table = <3 0 8 40>; ++ibm,numa-distance-table = <9>, /bits/ 8 < 10 20 80 ++ 20 10 160 ++ 80 160 10>; ++ | 0 8 40 ++--|------------ ++ | ++0 | 10 20 80 ++ | ++8 | 20 10 160 ++ | ++40| 80 160 10 ++ ++A possible "ibm,associativity" property for resources in node 0, 8 and 40 ++ ++{ 3, 6, 7, 0 } ++{ 3, 6, 9, 8 } ++{ 3, 6, 7, 40} ++ ++With "ibm,associativity-reference-points" { 0x3 } ++ ++"ibm,numa-lookup-index-table" helps in having a compact representation of distance matrix. ++Since domainID can be sparse, the matrix of distances can also be effectively sparse. ++With "ibm,numa-lookup-index-table" we can achieve a compact representation of ++distance information. +diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h +index 0cf648d829f15..89a31f1c7b118 100644 +--- a/arch/powerpc/include/asm/firmware.h ++++ b/arch/powerpc/include/asm/firmware.h +@@ -53,6 +53,7 @@ + #define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000) + #define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000) + #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000) ++#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000) + + #ifndef __ASSEMBLY__ + +@@ -73,7 +74,7 @@ enum { + FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | + FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | + FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | +- FW_FEATURE_RPT_INVALIDATE, ++ FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY, + FW_FEATURE_PSERIES_ALWAYS = 0, + FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR, + FW_FEATURE_POWERNV_ALWAYS = 0, +diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h +index df9fec9d232cb..5c80152e8f188 100644 +--- a/arch/powerpc/include/asm/prom.h ++++ b/arch/powerpc/include/asm/prom.h +@@ -149,6 +149,7 @@ extern int of_read_drc_info_cell(struct property **prop, + #define OV5_XCMO 0x0440 /* Page Coalescing */ + #define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity 
*/ + #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ ++#define OV5_FORM2_AFFINITY 0x0520 /* Form2 NUMA affinity */ + #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ + #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ + #define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ +diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c +index a3bf3587a4162..6f7ad80763159 100644 +--- a/arch/powerpc/kernel/prom_init.c ++++ b/arch/powerpc/kernel/prom_init.c +@@ -1069,7 +1069,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { + #else + 0, + #endif +- .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN), ++ .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) | ++ OV5_FEAT(OV5_FORM2_AFFINITY), + .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), + .micro_checkpoint = 0, + .reserved0 = 0, +diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c +index 010476abec344..cfc170935a58b 100644 +--- a/arch/powerpc/mm/numa.c ++++ b/arch/powerpc/mm/numa.c +@@ -56,12 +56,17 @@ static int n_mem_addr_cells, n_mem_size_cells; + + #define FORM0_AFFINITY 0 + #define FORM1_AFFINITY 1 ++#define FORM2_AFFINITY 2 + static int affinity_form; + + #define MAX_DISTANCE_REF_POINTS 4 + static int distance_ref_points_depth; + static const __be32 *distance_ref_points; + static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; ++static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { ++ [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } ++}; ++static int numa_id_index_table[MAX_NUMNODES] = { [0 ... 
MAX_NUMNODES - 1] = NUMA_NO_NODE }; + + /* + * Allocate node_to_cpumask_map based on number of available nodes +@@ -166,6 +171,54 @@ static void unmap_cpu_from_node(unsigned long cpu) + } + #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ + ++static int __associativity_to_nid(const __be32 *associativity, ++ int max_array_sz) ++{ ++ int nid; ++ /* ++ * primary_domain_index is 1 based array index. ++ */ ++ int index = primary_domain_index - 1; ++ ++ if (!numa_enabled || index >= max_array_sz) ++ return NUMA_NO_NODE; ++ ++ nid = of_read_number(&associativity[index], 1); ++ ++ /* POWER4 LPAR uses 0xffff as invalid node */ ++ if (nid == 0xffff || nid >= nr_node_ids) ++ nid = NUMA_NO_NODE; ++ return nid; ++} ++/* ++ * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA ++ * info is found. ++ */ ++static int associativity_to_nid(const __be32 *associativity) ++{ ++ int array_sz = of_read_number(associativity, 1); ++ ++ /* Skip the first element in the associativity array */ ++ return __associativity_to_nid((associativity + 1), array_sz); ++} ++ ++static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) ++{ ++ int dist; ++ int node1, node2; ++ ++ node1 = associativity_to_nid(cpu1_assoc); ++ node2 = associativity_to_nid(cpu2_assoc); ++ ++ dist = numa_distance_table[node1][node2]; ++ if (dist <= LOCAL_DISTANCE) ++ return 0; ++ else if (dist <= REMOTE_DISTANCE) ++ return 1; ++ else ++ return 2; ++} ++ + static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) + { + int dist = 0; +@@ -186,8 +239,9 @@ int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) + { + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); +- +- return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); ++ if (affinity_form == FORM1_AFFINITY) ++ return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); ++ return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); + } + + /* must hold 
reference to node during call */ +@@ -201,7 +255,9 @@ int __node_distance(int a, int b) + int i; + int distance = LOCAL_DISTANCE; + +- if (affinity_form == FORM0_AFFINITY) ++ if (affinity_form == FORM2_AFFINITY) ++ return numa_distance_table[a][b]; ++ else if (affinity_form == FORM0_AFFINITY) + return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); + + for (i = 0; i < distance_ref_points_depth; i++) { +@@ -216,37 +272,6 @@ int __node_distance(int a, int b) + } + EXPORT_SYMBOL(__node_distance); + +-static int __associativity_to_nid(const __be32 *associativity, +- int max_array_sz) +-{ +- int nid; +- /* +- * primary_domain_index is 1 based array index. +- */ +- int index = primary_domain_index - 1; +- +- if (!numa_enabled || index >= max_array_sz) +- return NUMA_NO_NODE; +- +- nid = of_read_number(&associativity[index], 1); +- +- /* POWER4 LPAR uses 0xffff as invalid node */ +- if (nid == 0xffff || nid >= nr_node_ids) +- nid = NUMA_NO_NODE; +- return nid; +-} +-/* +- * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA +- * info is found. +- */ +-static int associativity_to_nid(const __be32 *associativity) +-{ +- int array_sz = of_read_number(associativity, 1); +- +- /* Skip the first element in the associativity array */ +- return __associativity_to_nid((associativity + 1), array_sz); +-} +- + /* Returns the nid associated with the given device tree node, + * or -1 if not found. 
+ */ +@@ -320,6 +345,8 @@ static void initialize_form1_numa_distance(const __be32 *associativity) + */ + void update_numa_distance(struct device_node *node) + { ++ int nid; ++ + if (affinity_form == FORM0_AFFINITY) + return; + else if (affinity_form == FORM1_AFFINITY) { +@@ -332,6 +359,84 @@ void update_numa_distance(struct device_node *node) + initialize_form1_numa_distance(associativity); + return; + } ++ ++ /* FORM2 affinity */ ++ nid = of_node_to_nid_single(node); ++ if (nid == NUMA_NO_NODE) ++ return; ++ ++ /* ++ * With FORM2 we expect NUMA distance of all possible NUMA ++ * nodes to be provided during boot. ++ */ ++ WARN(numa_distance_table[nid][nid] == -1, ++ "NUMA distance details for node %d not provided\n", nid); ++} ++ ++/* ++ * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} ++ * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} ++ */ ++static void initialize_form2_numa_distance_lookup_table(void) ++{ ++ int i, j; ++ struct device_node *root; ++ const __u8 *numa_dist_table; ++ const __be32 *numa_lookup_index; ++ int numa_dist_table_length; ++ int max_numa_index, distance_index; ++ ++ if (firmware_has_feature(FW_FEATURE_OPAL)) ++ root = of_find_node_by_path("/ibm,opal"); ++ else ++ root = of_find_node_by_path("/rtas"); ++ if (!root) ++ root = of_find_node_by_path("/"); ++ ++ numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); ++ max_numa_index = of_read_number(&numa_lookup_index[0], 1); ++ ++ /* first element of the array is the size and is encode-int */ ++ numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL); ++ numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1); ++ /* Skip the size which is encoded int */ ++ numa_dist_table += sizeof(__be32); ++ ++ pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n", ++ numa_dist_table_length, max_numa_index); ++ ++ for (i = 0; i < max_numa_index; i++) ++ /* +1 skip the max_numa_index in 
the property */ ++ numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); ++ ++ ++ if (numa_dist_table_length != max_numa_index * max_numa_index) { ++ WARN(1, "Wrong NUMA distance information\n"); ++ /* consider everybody else just remote. */ ++ for (i = 0; i < max_numa_index; i++) { ++ for (j = 0; j < max_numa_index; j++) { ++ int nodeA = numa_id_index_table[i]; ++ int nodeB = numa_id_index_table[j]; ++ ++ if (nodeA == nodeB) ++ numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE; ++ else ++ numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE; ++ } ++ } ++ } ++ ++ distance_index = 0; ++ for (i = 0; i < max_numa_index; i++) { ++ for (j = 0; j < max_numa_index; j++) { ++ int nodeA = numa_id_index_table[i]; ++ int nodeB = numa_id_index_table[j]; ++ ++ numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++]; ++ pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]); ++ } ++ } ++ of_node_put(root); + } + + static int __init find_primary_domain_index(void) +@@ -344,6 +449,9 @@ static int __init find_primary_domain_index(void) + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) { + affinity_form = FORM1_AFFINITY; ++ } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { ++ dbg("Using form 2 affinity\n"); ++ affinity_form = FORM2_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { + dbg("Using form 1 affinity\n"); + affinity_form = FORM1_AFFINITY; +@@ -388,9 +496,12 @@ static int __init find_primary_domain_index(void) + + index = of_read_number(&distance_ref_points[1], 1); + } else { ++ /* ++ * Both FORM1 and FORM2 affinity find the primary domain details ++ * at the same offset. ++ */ + index = of_read_number(distance_ref_points, 1); + } +- + /* + * Warn and cap if the hardware supports more than + * MAX_DISTANCE_REF_POINTS domains. 
+@@ -819,6 +930,12 @@ static int __init parse_numa_properties(void) + + dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + ++ /* ++ * If it is FORM2 initialize the distance table here. ++ */ ++ if (affinity_form == FORM2_AFFINITY) ++ initialize_form2_numa_distance_lookup_table(); ++ + /* + * Even though we connect cpus to numa domains later in SMP + * init, we need to know the node ids now. This is because +diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c +index 5d4c2bc20bbab..f162156b7b68d 100644 +--- a/arch/powerpc/platforms/pseries/firmware.c ++++ b/arch/powerpc/platforms/pseries/firmware.c +@@ -123,6 +123,7 @@ vec5_fw_features_table[] = { + {FW_FEATURE_PRRN, OV5_PRRN}, + {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, + {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, ++ {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY}, + }; + + static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) +-- +2.39.2 + diff --git a/queue-5.10/powerpc-pseries-consolidate-different-numa-distance-.patch b/queue-5.10/powerpc-pseries-consolidate-different-numa-distance-.patch new file mode 100644 index 00000000000..97d94e46bdb --- /dev/null +++ b/queue-5.10/powerpc-pseries-consolidate-different-numa-distance-.patch @@ -0,0 +1,392 @@ +From 4b6bf61e15cd9ede4d014c47e0f3c90140818c8b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Aug 2021 18:52:21 +0530 +Subject: powerpc/pseries: Consolidate different NUMA distance update code + paths + +From: Aneesh Kumar K.V + +[ Upstream commit 8ddc6448ec5a5ef50eaa581a7dec0e12a02850ff ] + +The associativity details of the newly added resources are collected from +the hypervisor via "ibm,configure-connector" rtas call. Update the numa +distance details of the newly added numa node after the above call. + +Instead of updating NUMA distance every time we lookup a node id +from the associativity property, add helpers that can be used +during boot which does this only once.
Also remove the distance +update from node id lookup helpers. + +Currently, we duplicate parsing code for ibm,associativity and +ibm,associativity-lookup-arrays in the kernel. The associativity array provided +by these device tree properties are very similar and hence can use +a helper to parse the node id and numa distance details. + +Signed-off-by: Aneesh Kumar K.V +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210812132223.225214-4-aneesh.kumar@linux.ibm.com +Stable-dep-of: b277fc793daf ("powerpc/papr_scm: Update the NUMA distance table for the target node") +Signed-off-by: Sasha Levin +--- + arch/powerpc/include/asm/topology.h | 2 + + arch/powerpc/mm/numa.c | 212 +++++++++++++----- + arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 + + .../platforms/pseries/hotplug-memory.c | 2 + + 4 files changed, 161 insertions(+), 57 deletions(-) + +diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h +index 3beeb030cd78e..1604920d8d2de 100644 +--- a/arch/powerpc/include/asm/topology.h ++++ b/arch/powerpc/include/asm/topology.h +@@ -64,6 +64,7 @@ static inline int early_cpu_to_node(int cpu) + } + + int of_drconf_to_nid_single(struct drmem_lmb *lmb); ++void update_numa_distance(struct device_node *node); + + #else + +@@ -93,6 +94,7 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) + return first_online_node; + } + ++static inline void update_numa_distance(struct device_node *node) {} + #endif /* CONFIG_NUMA */ + + #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) +diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c +index 415cd3d258ff8..e61593ae25c9e 100644 +--- a/arch/powerpc/mm/numa.c ++++ b/arch/powerpc/mm/numa.c +@@ -208,50 +208,35 @@ int __node_distance(int a, int b) + } + EXPORT_SYMBOL(__node_distance); + +-static void initialize_distance_lookup_table(int nid, +- const __be32 *associativity) ++static int __associativity_to_nid(const __be32 *associativity, ++ int max_array_sz) + { +- 
int i; ++ int nid; ++ /* ++ * primary_domain_index is 1 based array index. ++ */ ++ int index = primary_domain_index - 1; + +- if (affinity_form != FORM1_AFFINITY) +- return; ++ if (!numa_enabled || index >= max_array_sz) ++ return NUMA_NO_NODE; + +- for (i = 0; i < distance_ref_points_depth; i++) { +- const __be32 *entry; ++ nid = of_read_number(&associativity[index], 1); + +- entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1]; +- distance_lookup_table[nid][i] = of_read_number(entry, 1); +- } ++ /* POWER4 LPAR uses 0xffff as invalid node */ ++ if (nid == 0xffff || nid >= nr_node_ids) ++ nid = NUMA_NO_NODE; ++ return nid; + } +- + /* + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. + */ + static int associativity_to_nid(const __be32 *associativity) + { +- int nid = NUMA_NO_NODE; +- +- if (!numa_enabled) +- goto out; +- +- if (of_read_number(associativity, 1) >= primary_domain_index) +- nid = of_read_number(&associativity[primary_domain_index], 1); +- +- /* POWER4 LPAR uses 0xffff as invalid node */ +- if (nid == 0xffff || nid >= nr_node_ids) +- nid = NUMA_NO_NODE; +- +- if (nid > 0 && +- of_read_number(associativity, 1) >= distance_ref_points_depth) { +- /* +- * Skip the length field and send start of associativity array +- */ +- initialize_distance_lookup_table(nid, associativity + 1); +- } ++ int array_sz = of_read_number(associativity, 1); + +-out: +- return nid; ++ /* Skip the first element in the associativity array */ ++ return __associativity_to_nid((associativity + 1), array_sz); + } + + /* Returns the nid associated with the given device tree node, +@@ -287,6 +272,60 @@ int of_node_to_nid(struct device_node *device) + } + EXPORT_SYMBOL(of_node_to_nid); + ++static void __initialize_form1_numa_distance(const __be32 *associativity, ++ int max_array_sz) ++{ ++ int i, nid; ++ ++ if (affinity_form != FORM1_AFFINITY) ++ return; ++ ++ nid = __associativity_to_nid(associativity, max_array_sz); ++ if (nid != 
NUMA_NO_NODE) { ++ for (i = 0; i < distance_ref_points_depth; i++) { ++ const __be32 *entry; ++ int index = be32_to_cpu(distance_ref_points[i]) - 1; ++ ++ /* ++ * broken hierarchy, return with broken distance table ++ */ ++ if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) ++ return; ++ ++ entry = &associativity[index]; ++ distance_lookup_table[nid][i] = of_read_number(entry, 1); ++ } ++ } ++} ++ ++static void initialize_form1_numa_distance(const __be32 *associativity) ++{ ++ int array_sz; ++ ++ array_sz = of_read_number(associativity, 1); ++ /* Skip the first element in the associativity array */ ++ __initialize_form1_numa_distance(associativity + 1, array_sz); ++} ++ ++/* ++ * Used to update distance information w.r.t newly added node. ++ */ ++void update_numa_distance(struct device_node *node) ++{ ++ if (affinity_form == FORM0_AFFINITY) ++ return; ++ else if (affinity_form == FORM1_AFFINITY) { ++ const __be32 *associativity; ++ ++ associativity = of_get_associativity(node); ++ if (!associativity) ++ return; ++ ++ initialize_form1_numa_distance(associativity); ++ return; ++ } ++} ++ + static int __init find_primary_domain_index(void) + { + int index; +@@ -433,6 +472,38 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) + return 0; + } + ++static int get_nid_and_numa_distance(struct drmem_lmb *lmb) ++{ ++ struct assoc_arrays aa = { .arrays = NULL }; ++ int default_nid = NUMA_NO_NODE; ++ int nid = default_nid; ++ int rc, index; ++ ++ if ((primary_domain_index < 0) || !numa_enabled) ++ return default_nid; ++ ++ rc = of_get_assoc_arrays(&aa); ++ if (rc) ++ return default_nid; ++ ++ if (primary_domain_index <= aa.array_sz && ++ !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { ++ const __be32 *associativity; ++ ++ index = lmb->aa_index * aa.array_sz; ++ associativity = &aa.arrays[index]; ++ nid = __associativity_to_nid(associativity, aa.array_sz); ++ if (nid > 0 && affinity_form == FORM1_AFFINITY) { ++ /* ++ * lookup 
array associativity entries have ++ * no length of the array as the first element. ++ */ ++ __initialize_form1_numa_distance(associativity, aa.array_sz); ++ } ++ } ++ return nid; ++} ++ + /* + * This is like of_node_to_nid_single() for memory represented in the + * ibm,dynamic-reconfiguration-memory node. +@@ -453,26 +524,19 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { +- index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; +- nid = of_read_number(&aa.arrays[index], 1); +- +- if (nid == 0xffff || nid >= nr_node_ids) +- nid = default_nid; ++ const __be32 *associativity; + +- if (nid > 0) { +- index = lmb->aa_index * aa.array_sz; +- initialize_distance_lookup_table(nid, +- &aa.arrays[index]); +- } ++ index = lmb->aa_index * aa.array_sz; ++ associativity = &aa.arrays[index]; ++ nid = __associativity_to_nid(associativity, aa.array_sz); + } +- + return nid; + } + + #ifdef CONFIG_PPC_SPLPAR +-static int vphn_get_nid(long lcpu) ++ ++static int __vphn_get_associativity(long lcpu, __be32 *associativity) + { +- __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + long rc, hwid; + + /* +@@ -492,12 +556,30 @@ static int vphn_get_nid(long lcpu) + + rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); + if (rc == H_SUCCESS) +- return associativity_to_nid(associativity); ++ return 0; + } + ++ return -1; ++} ++ ++static int vphn_get_nid(long lcpu) ++{ ++ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; ++ ++ ++ if (!__vphn_get_associativity(lcpu, associativity)) ++ return associativity_to_nid(associativity); ++ + return NUMA_NO_NODE; ++ + } + #else ++ ++static int __vphn_get_associativity(long lcpu, __be32 *associativity) ++{ ++ return -1; ++} ++ + static int vphn_get_nid(long unused) + { + return NUMA_NO_NODE; +@@ -692,7 +774,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, + size = read_n_cells(n_mem_size_cells, usm); + } + 
+- nid = of_drconf_to_nid_single(lmb); ++ nid = get_nid_and_numa_distance(lmb); + fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), + &nid); + node_set_online(nid); +@@ -709,6 +791,7 @@ static int __init parse_numa_properties(void) + struct device_node *memory; + int default_nid = 0; + unsigned long i; ++ const __be32 *associativity; + + if (numa_enabled == 0) { + printk(KERN_WARNING "NUMA disabled by user\n"); +@@ -734,18 +817,30 @@ static int __init parse_numa_properties(void) + * each node to be onlined must have NODE_DATA etc backing it. + */ + for_each_present_cpu(i) { ++ __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; + struct device_node *cpu; +- int nid = vphn_get_nid(i); ++ int nid = NUMA_NO_NODE; + +- /* +- * Don't fall back to default_nid yet -- we will plug +- * cpus into nodes once the memory scan has discovered +- * the topology. +- */ +- if (nid == NUMA_NO_NODE) { ++ memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); ++ ++ if (__vphn_get_associativity(i, vphn_assoc) == 0) { ++ nid = associativity_to_nid(vphn_assoc); ++ initialize_form1_numa_distance(vphn_assoc); ++ } else { ++ ++ /* ++ * Don't fall back to default_nid yet -- we will plug ++ * cpus into nodes once the memory scan has discovered ++ * the topology. ++ */ + cpu = of_get_cpu_node(i, NULL); + BUG_ON(!cpu); +- nid = of_node_to_nid_single(cpu); ++ ++ associativity = of_get_associativity(cpu); ++ if (associativity) { ++ nid = associativity_to_nid(associativity); ++ initialize_form1_numa_distance(associativity); ++ } + of_node_put(cpu); + } + +@@ -783,8 +878,11 @@ static int __init parse_numa_properties(void) + * have associativity properties. If none, then + * everything goes to default_nid. 
+ */ +- nid = of_node_to_nid_single(memory); +- if (nid < 0) ++ associativity = of_get_associativity(memory); ++ if (associativity) { ++ nid = associativity_to_nid(associativity); ++ initialize_form1_numa_distance(associativity); ++ } else + nid = default_nid; + + fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); +diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c +index 325f3b220f360..1f8f97210d143 100644 +--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c ++++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c +@@ -484,6 +484,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) + return saved_rc; + } + ++ update_numa_distance(dn); ++ + rc = dlpar_online_cpu(dn); + if (rc) { + saved_rc = rc; +diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c +index 7efe6ec5d14a4..a5f968b5fa3a8 100644 +--- a/arch/powerpc/platforms/pseries/hotplug-memory.c ++++ b/arch/powerpc/platforms/pseries/hotplug-memory.c +@@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) + return -ENODEV; + } + ++ update_numa_distance(lmb_node); ++ + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dr_node) { + dlpar_free_cc_nodes(lmb_node); +-- +2.39.2 + diff --git a/queue-5.10/powerpc-pseries-rename-min_common_depth-to-primary_d.patch b/queue-5.10/powerpc-pseries-rename-min_common_depth-to-primary_d.patch new file mode 100644 index 00000000000..13e3dccfc92 --- /dev/null +++ b/queue-5.10/powerpc-pseries-rename-min_common_depth-to-primary_d.patch @@ -0,0 +1,158 @@ +From 9c3309edcac097af0deead823da6e4fa056691d9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Aug 2021 18:52:19 +0530 +Subject: powerpc/pseries: rename min_common_depth to primary_domain_index + +From: Aneesh Kumar K.V + +[ Upstream commit 7e35ef662ca05c42dbc2f401bb76d9219dd7fd02 ] + +No functional change in this patch. 
+ +Signed-off-by: Aneesh Kumar K.V +Reviewed-by: David Gibson +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210812132223.225214-2-aneesh.kumar@linux.ibm.com +Stable-dep-of: b277fc793daf ("powerpc/papr_scm: Update the NUMA distance table for the target node") +Signed-off-by: Sasha Levin +--- + arch/powerpc/mm/numa.c | 38 +++++++++++++++++++------------------- + 1 file changed, 19 insertions(+), 19 deletions(-) + +diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c +index 275c60f92a7ce..a21f62fcda1e8 100644 +--- a/arch/powerpc/mm/numa.c ++++ b/arch/powerpc/mm/numa.c +@@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table); + EXPORT_SYMBOL(node_to_cpumask_map); + EXPORT_SYMBOL(node_data); + +-static int min_common_depth; ++static int primary_domain_index; + static int n_mem_addr_cells, n_mem_size_cells; + static int form1_affinity; + +@@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity) + if (!numa_enabled) + goto out; + +- if (of_read_number(associativity, 1) >= min_common_depth) +- nid = of_read_number(&associativity[min_common_depth], 1); ++ if (of_read_number(associativity, 1) >= primary_domain_index) ++ nid = of_read_number(&associativity[primary_domain_index], 1); + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) +@@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device) + } + EXPORT_SYMBOL(of_node_to_nid); + +-static int __init find_min_common_depth(void) ++static int __init find_primary_domain_index(void) + { +- int depth; ++ int index; + struct device_node *root; + + if (firmware_has_feature(FW_FEATURE_OPAL)) +@@ -326,7 +326,7 @@ static int __init find_min_common_depth(void) + } + + if (form1_affinity) { +- depth = of_read_number(distance_ref_points, 1); ++ index = of_read_number(distance_ref_points, 1); + } else { + if (distance_ref_points_depth < 2) { + printk(KERN_WARNING "NUMA: " +@@ -334,7 +334,7 @@ static int __init find_min_common_depth(void) + 
goto err; + } + +- depth = of_read_number(&distance_ref_points[1], 1); ++ index = of_read_number(&distance_ref_points[1], 1); + } + + /* +@@ -348,7 +348,7 @@ static int __init find_min_common_depth(void) + } + + of_node_put(root); +- return depth; ++ return index; + + err: + of_node_put(root); +@@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) + int nid = default_nid; + int rc, index; + +- if ((min_common_depth < 0) || !numa_enabled) ++ if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; + + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + +- if (min_common_depth <= aa.array_sz && ++ if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { +- index = lmb->aa_index * aa.array_sz + min_common_depth - 1; ++ index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; + nid = of_read_number(&aa.arrays[index], 1); + + if (nid == 0xffff || nid >= nr_node_ids) +@@ -708,18 +708,18 @@ static int __init parse_numa_properties(void) + return -1; + } + +- min_common_depth = find_min_common_depth(); ++ primary_domain_index = find_primary_domain_index(); + +- if (min_common_depth < 0) { ++ if (primary_domain_index < 0) { + /* +- * if we fail to parse min_common_depth from device tree ++ * if we fail to parse primary_domain_index from device tree + * mark the numa disabled, boot with numa disabled. 
+ */ + numa_enabled = false; +- return min_common_depth; ++ return primary_domain_index; + } + +- dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); ++ dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + + /* + * Even though we connect cpus to numa domains later in SMP +@@ -926,7 +926,7 @@ static void __init find_possible_nodes(void) + goto out; + } + +- max_nodes = of_read_number(&domains[min_common_depth], 1); ++ max_nodes = of_read_number(&domains[primary_domain_index], 1); + pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); + + for (i = 0; i < max_nodes; i++) { +@@ -935,7 +935,7 @@ static void __init find_possible_nodes(void) + } + + prop_length /= sizeof(int); +- if (prop_length > min_common_depth + 2) ++ if (prop_length > primary_domain_index + 2) + coregroup_enabled = 1; + + out: +@@ -1268,7 +1268,7 @@ int cpu_to_coregroup_id(int cpu) + goto out; + + index = of_read_number(associativity, 1); +- if (index > min_common_depth + 1) ++ if (index > primary_domain_index + 1) + return of_read_number(&associativity[index - 1], 1); + + out: +-- +2.39.2 + diff --git a/queue-5.10/powerpc-pseries-rename-type1_affinity-to-form1_affin.patch b/queue-5.10/powerpc-pseries-rename-type1_affinity-to-form1_affin.patch new file mode 100644 index 00000000000..95ff6b91bb8 --- /dev/null +++ b/queue-5.10/powerpc-pseries-rename-type1_affinity-to-form1_affin.patch @@ -0,0 +1,170 @@ +From cacb3c31cf1481761b80e46f88551ddbb0df9d44 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Aug 2021 18:52:20 +0530 +Subject: powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY + +From: Aneesh Kumar K.V + +[ Upstream commit 0eacd06bb8adea8dd9edb0a30144166d9f227e64 ] + +Also make related code cleanup that will allow adding FORM2_AFFINITY in +later patches. No functional change in this patch. 
+ +Signed-off-by: Aneesh Kumar K.V +Reviewed-by: David Gibson +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20210812132223.225214-3-aneesh.kumar@linux.ibm.com +Stable-dep-of: b277fc793daf ("powerpc/papr_scm: Update the NUMA distance table for the target node") +Signed-off-by: Sasha Levin +--- + arch/powerpc/include/asm/firmware.h | 4 +-- + arch/powerpc/include/asm/prom.h | 2 +- + arch/powerpc/kernel/prom_init.c | 2 +- + arch/powerpc/mm/numa.c | 35 ++++++++++++++--------- + arch/powerpc/platforms/pseries/firmware.c | 2 +- + 5 files changed, 26 insertions(+), 19 deletions(-) + +diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h +index aa6a5ef5d4830..0cf648d829f15 100644 +--- a/arch/powerpc/include/asm/firmware.h ++++ b/arch/powerpc/include/asm/firmware.h +@@ -44,7 +44,7 @@ + #define FW_FEATURE_OPAL ASM_CONST(0x0000000010000000) + #define FW_FEATURE_SET_MODE ASM_CONST(0x0000000040000000) + #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x0000000080000000) +-#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000) ++#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000) + #define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000) + #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000) + #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) +@@ -69,7 +69,7 @@ enum { + FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | + FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO | + FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | +- FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | ++ FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN | + FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | + FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | + FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | +diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h +index 324a13351749a..df9fec9d232cb 100644 +--- a/arch/powerpc/include/asm/prom.h ++++ b/arch/powerpc/include/asm/prom.h +@@ -147,7 +147,7 @@ extern int 
of_read_drc_info_cell(struct property **prop, + #define OV5_MSI 0x0201 /* PCIe/MSI support */ + #define OV5_CMO 0x0480 /* Cooperative Memory Overcommitment */ + #define OV5_XCMO 0x0440 /* Page Coalescing */ +-#define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ ++#define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */ + #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ + #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ + #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ +diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c +index 9e71c0739f08d..a3bf3587a4162 100644 +--- a/arch/powerpc/kernel/prom_init.c ++++ b/arch/powerpc/kernel/prom_init.c +@@ -1069,7 +1069,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { + #else + 0, + #endif +- .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), ++ .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), + .micro_checkpoint = 0, + .reserved0 = 0, +diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c +index a21f62fcda1e8..415cd3d258ff8 100644 +--- a/arch/powerpc/mm/numa.c ++++ b/arch/powerpc/mm/numa.c +@@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data); + + static int primary_domain_index; + static int n_mem_addr_cells, n_mem_size_cells; +-static int form1_affinity; ++ ++#define FORM0_AFFINITY 0 ++#define FORM1_AFFINITY 1 ++static int affinity_form; + + #define MAX_DISTANCE_REF_POINTS 4 + static int distance_ref_points_depth; +@@ -190,7 +193,7 @@ int __node_distance(int a, int b) + int i; + int distance = LOCAL_DISTANCE; + +- if (!form1_affinity) ++ if (affinity_form == FORM0_AFFINITY) + return ((a == b) ? 
LOCAL_DISTANCE : REMOTE_DISTANCE); + + for (i = 0; i < distance_ref_points_depth; i++) { +@@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid, + { + int i; + +- if (!form1_affinity) ++ if (affinity_form != FORM1_AFFINITY) + return; + + for (i = 0; i < distance_ref_points_depth; i++) { +@@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void) + int index; + struct device_node *root; + ++ /* ++ * Check for which form of affinity. ++ */ ++ if (firmware_has_feature(FW_FEATURE_OPAL)) { ++ affinity_form = FORM1_AFFINITY; ++ } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { ++ dbg("Using form 1 affinity\n"); ++ affinity_form = FORM1_AFFINITY; ++ } else ++ affinity_form = FORM0_AFFINITY; ++ + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else +@@ -318,23 +332,16 @@ static int __init find_primary_domain_index(void) + } + + distance_ref_points_depth /= sizeof(int); +- +- if (firmware_has_feature(FW_FEATURE_OPAL) || +- firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { +- dbg("Using form 1 affinity\n"); +- form1_affinity = 1; +- } +- +- if (form1_affinity) { +- index = of_read_number(distance_ref_points, 1); +- } else { ++ if (affinity_form == FORM0_AFFINITY) { + if (distance_ref_points_depth < 2) { + printk(KERN_WARNING "NUMA: " +- "short ibm,associativity-reference-points\n"); ++ "short ibm,associativity-reference-points\n"); + goto err; + } + + index = of_read_number(&distance_ref_points[1], 1); ++ } else { ++ index = of_read_number(distance_ref_points, 1); + } + + /* +diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c +index 4c7b7f5a2ebca..5d4c2bc20bbab 100644 +--- a/arch/powerpc/platforms/pseries/firmware.c ++++ b/arch/powerpc/platforms/pseries/firmware.c +@@ -119,7 +119,7 @@ struct vec5_fw_feature { + + static __initdata struct vec5_fw_feature + vec5_fw_features_table[] = { +- {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, ++ 
{FW_FEATURE_FORM1_AFFINITY, OV5_FORM1_AFFINITY}, + {FW_FEATURE_PRRN, OV5_PRRN}, + {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, + {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, +-- +2.39.2 + diff --git a/queue-5.10/sched-fair-fix-imbalance-overflow.patch b/queue-5.10/sched-fair-fix-imbalance-overflow.patch new file mode 100644 index 00000000000..49e4addab0d --- /dev/null +++ b/queue-5.10/sched-fair-fix-imbalance-overflow.patch @@ -0,0 +1,48 @@ +From 136d511b9163abd01b1b6231b3562ea166e97929 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 11 Apr 2023 11:06:11 +0200 +Subject: sched/fair: Fix imbalance overflow + +From: Vincent Guittot + +[ Upstream commit 91dcf1e8068e9a8823e419a7a34ff4341275fb70 ] + +When local group is fully busy but its average load is above system load, +computing the imbalance will overflow and local group is not the best +target for pulling this load. + +Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()") +Reported-by: Tingjia Cao +Signed-off-by: Vincent Guittot +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Tingjia Cao +Link: https://lore.kernel.org/lkml/CABcWv9_DAhVBOq2=W=2ypKE9dKM5s2DvoV8-U0+GDwwuKZ89jQ@mail.gmail.com/T/ +Signed-off-by: Sasha Levin +--- + kernel/sched/fair.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 22139e97b2a8e..57a58bc48021a 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9353,6 +9353,16 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + + sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / + sds->total_capacity; ++ ++ /* ++ * If the local group is more loaded than the average system ++ * load, don't try to pull any tasks. 
++ */ ++ if (local->avg_load >= sds->avg_load) { ++ env->imbalance = 0; ++ return; ++ } ++ + } + + /* +-- +2.39.2 + diff --git a/queue-5.10/sched-fair-move-calculate-of-avg_load-to-a-better-lo.patch b/queue-5.10/sched-fair-move-calculate-of-avg_load-to-a-better-lo.patch new file mode 100644 index 00000000000..db3f9276cb8 --- /dev/null +++ b/queue-5.10/sched-fair-move-calculate-of-avg_load-to-a-better-lo.patch @@ -0,0 +1,50 @@ +From 59704a62f12269ef7882e6c95a2e60b39f2adc84 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Apr 2022 17:57:05 +0800 +Subject: sched/fair: Move calculate of avg_load to a better location + +From: zgpeng + +[ Upstream commit 06354900787f25bf5be3c07a68e3cdbc5bf0fa69 ] + +In calculate_imbalance function, when the value of local->avg_load is +greater than or equal to busiest->avg_load, the calculated sds->avg_load is +not used. So this calculation can be placed in a more appropriate position. + +Signed-off-by: zgpeng +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: Samuel Liao +Reviewed-by: Vincent Guittot +Link: https://lore.kernel.org/r/1649239025-10010-1-git-send-email-zgpeng@tencent.com +Stable-dep-of: 91dcf1e8068e ("sched/fair: Fix imbalance overflow") +Signed-off-by: Sasha Levin +--- + kernel/sched/fair.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index bb70a7856277f..22139e97b2a8e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9342,8 +9342,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / + local->group_capacity; + +- sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / +- sds->total_capacity; + /* + * If the local group is more loaded than the selected + * busiest group don't try to pull any tasks. 
+@@ -9352,6 +9350,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + env->imbalance = 0; + return; + } ++ ++ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / ++ sds->total_capacity; + } + + /* +-- +2.39.2 + diff --git a/queue-5.10/series b/queue-5.10/series index e3bcbf83bb0..1bfc575cee3 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -95,3 +95,16 @@ net-sfp-initialize-sfp-i2c_block_size-at-sfp-allocation.patch scsi-ses-handle-enclosure-with-just-a-primary-component-gracefully.patch x86-pci-add-quirk-for-amd-xhci-controller-that-loses-msi-x-state-in-d3hot.patch cgroup-cpuset-wake-up-cpuset_attach_wq-tasks-in-cpuset_cancel_attach.patch +mtd-ubi-wl-fix-a-couple-of-kernel-doc-issues.patch +ubi-fix-deadlock-caused-by-recursively-holding-work_.patch +ubi-fix-failure-attaching-when-vid_hdr-offset-equals.patch +powerpc-pseries-rename-min_common_depth-to-primary_d.patch +powerpc-pseries-rename-type1_affinity-to-form1_affin.patch +powerpc-pseries-consolidate-different-numa-distance-.patch +powerpc-pseries-add-a-helper-for-form1-cpu-distance.patch +powerpc-pseries-add-support-for-form2-associativity.patch +powerpc-papr_scm-update-the-numa-distance-table-for-.patch +sched-fair-move-calculate-of-avg_load-to-a-better-lo.patch +sched-fair-fix-imbalance-overflow.patch +x86-rtc-remove-__init-for-runtime-functions.patch +i2c-ocores-generate-stop-condition-after-timeout-in-.patch diff --git a/queue-5.10/ubi-fix-deadlock-caused-by-recursively-holding-work_.patch b/queue-5.10/ubi-fix-deadlock-caused-by-recursively-holding-work_.patch new file mode 100644 index 00000000000..c2fe1193651 --- /dev/null +++ b/queue-5.10/ubi-fix-deadlock-caused-by-recursively-holding-work_.patch @@ -0,0 +1,66 @@ +From 86d1fe87a0c24152f0e88d1b54c77c2618f9e2e5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 4 Mar 2023 09:41:41 +0800 +Subject: ubi: Fix deadlock caused by recursively holding work_sem + +From: ZhaoLong Wang + +[ Upstream commit 
f773f0a331d6c41733b17bebbc1b6cae12e016f5 ] + +During the processing of the bgt, if the sync_erase() return -EBUSY +or some other error code in __erase_worker(),schedule_erase() called +again lead to the down_read(ubi->work_sem) hold twice and may get +block by down_write(ubi->work_sem) in ubi_update_fastmap(), +which cause deadlock. + + ubi bgt other task + do_work + down_read(&ubi->work_sem) ubi_update_fastmap + erase_worker # Blocked by down_read + __erase_worker down_write(&ubi->work_sem) + schedule_erase + schedule_ubi_work + down_read(&ubi->work_sem) + +Fix this by changing input parameter @nested of the schedule_erase() to +'true' to avoid recursively acquiring the down_read(&ubi->work_sem). + +Also, fix the incorrect comment about @nested parameter of the +schedule_erase() because when down_write(ubi->work_sem) is held, the +@nested is also need be true. + +Link: https://bugzilla.kernel.org/show_bug.cgi?id=217093 +Fixes: 2e8f08deabbc ("ubi: Fix races around ubi_refill_pools()") +Signed-off-by: ZhaoLong Wang +Reviewed-by: Zhihao Cheng +Signed-off-by: Richard Weinberger +Signed-off-by: Sasha Levin +--- + drivers/mtd/ubi/wl.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c +index 2ee0e60c43c2e..4427018ad4d9b 100644 +--- a/drivers/mtd/ubi/wl.c ++++ b/drivers/mtd/ubi/wl.c +@@ -575,7 +575,7 @@ static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk, + * @vol_id: the volume ID that last used this PEB + * @lnum: the last used logical eraseblock number for the PEB + * @torture: if the physical eraseblock has to be tortured +- * @nested: denotes whether the work_sem is already held in read mode ++ * @nested: denotes whether the work_sem is already held + * + * This function returns zero in case of success and a %-ENOMEM in case of + * failure. 
+@@ -1121,7 +1121,7 @@ static int __erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk) + int err1; + + /* Re-schedule the LEB for erasure */ +- err1 = schedule_erase(ubi, e, vol_id, lnum, 0, false); ++ err1 = schedule_erase(ubi, e, vol_id, lnum, 0, true); + if (err1) { + spin_lock(&ubi->wl_lock); + wl_entry_destroy(ubi, e); +-- +2.39.2 + diff --git a/queue-5.10/ubi-fix-failure-attaching-when-vid_hdr-offset-equals.patch b/queue-5.10/ubi-fix-failure-attaching-when-vid_hdr-offset-equals.patch new file mode 100644 index 00000000000..fa94e16865c --- /dev/null +++ b/queue-5.10/ubi-fix-failure-attaching-when-vid_hdr-offset-equals.patch @@ -0,0 +1,79 @@ +From 4a6d08de55811e838cffbbdba0beb842f635d9f6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 6 Mar 2023 09:33:08 +0800 +Subject: ubi: Fix failure attaching when vid_hdr offset equals to (sub)page + size + +From: Zhihao Cheng + +[ Upstream commit 1e020e1b96afdecd20680b5b5be2a6ffc3d27628 ] + +The following process makes ubi attaching fail since commit +1b42b1a36fc946 ("ubi: ensure that VID header offset ... size"): + +ID="0xec,0xa1,0x00,0x15" # 128M 128KB 2KB +modprobe nandsim id_bytes=$ID +flash_eraseall /dev/mtd0 +modprobe ubi mtd="0,2048" # set vid_hdr offset as 2048 (one page) +(dmesg): + ubi0 error: ubi_attach_mtd_dev [ubi]: VID header offset 2048 too large. + UBI error: cannot attach mtd0 + UBI error: cannot initialize UBI, error -22 + +Rework the original solution: the key point is making sure +'vid_hdr_shift + UBI_VID_HDR_SIZE <= ubi->vid_hdr_alsize', +so we should check vid_hdr_shift rather than vid_hdr_offset. +Then, ubi still supports (sub)page aligned VID header offset. + +Fixes: 1b42b1a36fc946 ("ubi: ensure that VID header offset ... 
size") +Signed-off-by: Zhihao Cheng +Tested-by: Nicolas Schichan +Tested-by: Miquel Raynal # v5.10, v4.19 +Signed-off-by: Richard Weinberger +Signed-off-by: Sasha Levin +--- + drivers/mtd/ubi/build.c | 21 +++++++++++++++------ + 1 file changed, 15 insertions(+), 6 deletions(-) + +diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c +index e45fdc1bf66a4..929ce489b0629 100644 +--- a/drivers/mtd/ubi/build.c ++++ b/drivers/mtd/ubi/build.c +@@ -665,12 +665,6 @@ static int io_init(struct ubi_device *ubi, int max_beb_per1024) + ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size); + ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size); + +- if (ubi->vid_hdr_offset && ((ubi->vid_hdr_offset + UBI_VID_HDR_SIZE) > +- ubi->vid_hdr_alsize)) { +- ubi_err(ubi, "VID header offset %d too large.", ubi->vid_hdr_offset); +- return -EINVAL; +- } +- + dbg_gen("min_io_size %d", ubi->min_io_size); + dbg_gen("max_write_size %d", ubi->max_write_size); + dbg_gen("hdrs_min_io_size %d", ubi->hdrs_min_io_size); +@@ -688,6 +682,21 @@ static int io_init(struct ubi_device *ubi, int max_beb_per1024) + ubi->vid_hdr_aloffset; + } + ++ /* ++ * Memory allocation for VID header is ubi->vid_hdr_alsize ++ * which is described in comments in io.c. ++ * Make sure VID header shift + UBI_VID_HDR_SIZE not exceeds ++ * ubi->vid_hdr_alsize, so that all vid header operations ++ * won't access memory out of bounds. 
++ */ ++ if ((ubi->vid_hdr_shift + UBI_VID_HDR_SIZE) > ubi->vid_hdr_alsize) { ++ ubi_err(ubi, "Invalid VID header offset %d, VID header shift(%d)" ++ " + VID header size(%zu) > VID header aligned size(%d).", ++ ubi->vid_hdr_offset, ubi->vid_hdr_shift, ++ UBI_VID_HDR_SIZE, ubi->vid_hdr_alsize); ++ return -EINVAL; ++ } ++ + /* Similar for the data offset */ + ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE; + ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); +-- +2.39.2 + diff --git a/queue-5.10/x86-rtc-remove-__init-for-runtime-functions.patch b/queue-5.10/x86-rtc-remove-__init-for-runtime-functions.patch new file mode 100644 index 00000000000..b2181b4c7e3 --- /dev/null +++ b/queue-5.10/x86-rtc-remove-__init-for-runtime-functions.patch @@ -0,0 +1,53 @@ +From c6cd6c861acab70decc185158b61542c40fe1952 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 6 Apr 2023 08:26:52 +0200 +Subject: x86/rtc: Remove __init for runtime functions + +From: Matija Glavinic Pecotic + +[ Upstream commit 775d3c514c5b2763a50ab7839026d7561795924d ] + +set_rtc_noop(), get_rtc_noop() are used after booting, therefore their __init +annotation is wrong. + +A crash was observed on an x86 platform where CMOS RTC is unused and +disabled via device tree. set_rtc_noop() was invoked from ntp: +sync_hw_clock(), although CONFIG_RTC_SYSTOHC=n, however sync_cmos_clock() +doesn't honour that. + + Workqueue: events_power_efficient sync_hw_clock + RIP: 0010:set_rtc_noop + Call Trace: + update_persistent_clock64 + sync_hw_clock + +Fix this by dropping the __init annotation from set/get_rtc_noop(). 
+ +Fixes: c311ed6183f4 ("x86/init: Allow DT configured systems to disable RTC at boot time") +Signed-off-by: Matija Glavinic Pecotic +Signed-off-by: Thomas Gleixner +Reviewed-by: Andy Shevchenko +Link: https://lore.kernel.org/r/59f7ceb1-446b-1d3d-0bc8-1f0ee94b1e18@nokia.com +Signed-off-by: Sasha Levin +--- + arch/x86/kernel/x86_init.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c +index a3038d8deb6a4..b758eeea6090b 100644 +--- a/arch/x86/kernel/x86_init.c ++++ b/arch/x86/kernel/x86_init.c +@@ -32,8 +32,8 @@ static int __init iommu_init_noop(void) { return 0; } + static void iommu_shutdown_noop(void) { } + bool __init bool_x86_init_noop(void) { return false; } + void x86_op_int_noop(int cpu) { } +-static __init int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } +-static __init void get_rtc_noop(struct timespec64 *now) { } ++static int set_rtc_noop(const struct timespec64 *now) { return -EINVAL; } ++static void get_rtc_noop(struct timespec64 *now) { } + + static __initconst const struct of_device_id of_cmos_match[] = { + { .compatible = "motorola,mc146818" }, +-- +2.39.2 + -- 2.47.3