From f4fe49d477c7f12aedb44e91b59559927bea282e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 19 Jan 2020 16:42:21 +0100 Subject: [PATCH] 5.4-stable patches added patches: arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch block-fix-the-type-of-sts-in-bsg_queue_rq.patch btrfs-always-copy-scrub-arguments-back-to-user-space.patch btrfs-check-rw_devices-not-num_devices-for-balance.patch btrfs-do-not-delete-mismatched-root-refs.patch btrfs-fix-invalid-removal-of-root-ref.patch btrfs-fix-memory-leak-in-qgroup-accounting.patch btrfs-relocation-fix-reloc_root-lifespan-and-access.patch mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch net-fix-kernel-doc-warning-in-linux-netdevice.h.patch net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch net-stmmac-enable-16kb-buffer-size.patch reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch tipc-fix-potential-hanging-after-b-rcast-changing.patch tipc-fix-retrans-failure-due-to-wrong-destination.patch --- ...mipi-use-1.5-version-of-i.core-mx6dl.patch | 45 +++ ...ve-incorrect-power-supply-assignment.patch | 39 +++ ...ve-incorrect-power-supply-assignment.patch | 39 +++ ...ve-incorrect-power-supply-assignment.patch | 39 +++ ...ve-incorrect-power-supply-assignment.patch | 53 ++++ ...libri-imx7s-256mb-nand-flash-support.patch | 31 +++ ...-fix-the-type-of-sts-in-bsg_queue_rq.patch | 39 +++ ...y-scrub-arguments-back-to-user-space.patch | 64 +++++ ..._devices-not-num_devices-for-balance.patch | 92 ++++++ ...s-do-not-delete-mismatched-root-refs.patch | 45 +++ ...trfs-fix-invalid-removal-of-root-ref.patch | 89 ++++++ ...fix-memory-leak-in-qgroup-accounting.patch | 80 ++++++ ...n-fix-reloc_root-lifespan-and-access.patch | 234 ++++++++++++++++ ...-don-t-rely-on-static-keys-too-early.patch | 263 ++++++++++++++++++ ...47bit-hint-address-and-pmd-alignment.patch | 134 +++++++++ ...escription-for-scan_page_has_private.patch | 39 +++ ...eue-only-if-memcg-workqueue-is-valid.patch | 85 ++++++ ...lab-fix-percpu-slab-vmstats-flushing.patch | 178 ++++++++++++ ...en-removing-a-re-added-early-section.patch | 97 +++++++ ...division-by-zero-in-wb_min_max_ratio.patch | 79 ++++++ ...47bit-hint-address-and-pmd-alignment.patch | 74 +++++ ...nel-doc-warning-in-linux-netdevice.h.patch | 34 +++ ...-16kb-buffer-must-be-16-byte-aligned.patch | 34 +++ .../net-stmmac-enable-16kb-buffer-size.patch | 34 +++ ...rol_array_get-kerneldoc-return-types.patch | 46 +++ queue-5.4/series | 27 ++ ...ntial-hanging-after-b-rcast-changing.patch | 105 +++++++ ...ans-failure-due-to-wrong-destination.patch | 100 +++++++ 28 files changed, 2218 insertions(+) create mode 100644 
queue-5.4/arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch create mode 100644 queue-5.4/arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch create mode 100644 queue-5.4/arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch create mode 100644 queue-5.4/arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch create mode 100644 queue-5.4/arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch create mode 100644 queue-5.4/arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch create mode 100644 queue-5.4/block-fix-the-type-of-sts-in-bsg_queue_rq.patch create mode 100644 queue-5.4/btrfs-always-copy-scrub-arguments-back-to-user-space.patch create mode 100644 queue-5.4/btrfs-check-rw_devices-not-num_devices-for-balance.patch create mode 100644 queue-5.4/btrfs-do-not-delete-mismatched-root-refs.patch create mode 100644 queue-5.4/btrfs-fix-invalid-removal-of-root-ref.patch create mode 100644 queue-5.4/btrfs-fix-memory-leak-in-qgroup-accounting.patch create mode 100644 queue-5.4/btrfs-relocation-fix-reloc_root-lifespan-and-access.patch create mode 100644 queue-5.4/mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch create mode 100644 queue-5.4/mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch create mode 100644 queue-5.4/mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch create mode 100644 queue-5.4/mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch create mode 100644 queue-5.4/mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch create mode 100644 queue-5.4/mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch create mode 100644 queue-5.4/mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch create mode 100644 queue-5.4/mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch create mode 100644 queue-5.4/net-fix-kernel-doc-warning-in-linux-netdevice.h.patch create mode 100644 queue-5.4/net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch create mode 100644 queue-5.4/net-stmmac-enable-16kb-buffer-size.patch create mode 100644 queue-5.4/reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch create mode 100644 queue-5.4/tipc-fix-potential-hanging-after-b-rcast-changing.patch create mode 100644 queue-5.4/tipc-fix-retrans-failure-due-to-wrong-destination.patch diff --git a/queue-5.4/arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch b/queue-5.4/arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch new file mode 100644 index 00000000000..5cfeef751f7 --- /dev/null +++ b/queue-5.4/arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch @@ -0,0 +1,45 @@ +From 4a132f60808ae3a751e107a373f8572012352d3c Mon Sep 17 00:00:00 2001 +From: Jagan Teki +Date: Mon, 30 Dec 2019 17:30:19 +0530 +Subject: ARM: dts: imx6q-icore-mipi: Use 1.5 version of i.Core MX6DL + +From: Jagan Teki + +commit 4a132f60808ae3a751e107a373f8572012352d3c upstream. + +The EDIMM STARTER KIT i.Core 1.5 MIPI Evaluation is based on +the 1.5 version of the i.Core MX6 cpu module. The 1.5 version +differs from the original one for a few details, including the +ethernet PHY interface clock provider. 
+ +With this commit, the ethernet interface works properly: +SMSC LAN8710/LAN8720 2188000.ethernet-1:00: attached PHY driver + +Before this commit, using the 1.5 version, the ethernet interface failed +to start up due to an un-clocked PHY interface: +fec 2188000.ethernet eth0: could not attach to PHY + +A similar fix was merged for the i.Core MX6Q, but the update for the DL +was missed. + +Fixes: a8039f2dd089 ("ARM: dts: imx6dl: Add Engicam i.CoreM6 1.5 Quad/Dual MIPI starter kit support") +Cc: Jacopo Mondi +Signed-off-by: Michael Trimarchi +Signed-off-by: Jagan Teki +Signed-off-by: Shawn Guo +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/boot/dts/imx6dl-icore-mipi.dts | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/arm/boot/dts/imx6dl-icore-mipi.dts ++++ b/arch/arm/boot/dts/imx6dl-icore-mipi.dts +@@ -8,7 +8,7 @@ + /dts-v1/; + + #include "imx6dl.dtsi" +-#include "imx6qdl-icore.dtsi" ++#include "imx6qdl-icore-1.5.dtsi" + + / { + model = "Engicam i.CoreM6 DualLite/Solo MIPI Starter Kit"; diff --git a/queue-5.4/arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch b/queue-5.4/arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch new file mode 100644 index 00000000000..e6a490a60f0 --- /dev/null +++ b/queue-5.4/arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch @@ -0,0 +1,39 @@ +From 4521de30fbb3f5be0db58de93582ebce72c9d44f Mon Sep 17 00:00:00 2001 +From: Anson Huang +Date: Mon, 30 Dec 2019 09:41:07 +0800 +Subject: ARM: dts: imx6qdl-sabresd: Remove incorrect power supply assignment + +From: Anson Huang + +commit 4521de30fbb3f5be0db58de93582ebce72c9d44f upstream. + +The vdd3p0 LDO's input should come directly from the external USB VBUS, +NOT from the PMIC's power supply. The vdd3p0 LDO's target output voltage +can be controlled by SW, and it requires the input voltage to be high +enough; with an incorrect power supply assigned, if that supply's voltage +is lower than the LDO's target output voltage, the adjustment fails and +the LDO voltage update is skipped. Remove the power supply assignment for +vdd3p0 to avoid this scenario. + +Fixes: 93385546ba36 ("ARM: dts: imx6qdl-sabresd: Assign corresponding power supply for LDOs") +Signed-off-by: Anson Huang +Signed-off-by: Shawn Guo +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/boot/dts/imx6qdl-sabresd.dtsi | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/arch/arm/boot/dts/imx6qdl-sabresd.dtsi ++++ b/arch/arm/boot/dts/imx6qdl-sabresd.dtsi +@@ -749,10 +749,6 @@ + vin-supply = <&vgen5_reg>; + }; + +-&reg_vdd3p0 { +- vin-supply = <&sw2_reg>; +-}; +- + &reg_vdd2p5 { + vin-supply = <&vgen5_reg>; + }; diff --git a/queue-5.4/arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch b/queue-5.4/arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch new file mode 100644 index 00000000000..6fcf5be093b --- /dev/null +++ b/queue-5.4/arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch @@ -0,0 +1,39 @@ +From b4eb9ef0e29cd28c6fd684e0ab77bda824acb20e Mon Sep 17 00:00:00 2001 +From: Anson Huang +Date: Mon, 30 Dec 2019 09:41:09 +0800 +Subject: ARM: dts: imx6sl-evk: Remove incorrect power supply assignment + +From: Anson Huang + +commit b4eb9ef0e29cd28c6fd684e0ab77bda824acb20e upstream.
+ +The vdd3p0 LDO's input should come directly from the external USB VBUS, +NOT from the PMIC's power supply. The vdd3p0 LDO's target output voltage +can be controlled by SW, and it requires the input voltage to be high +enough; with an incorrect power supply assigned, if that supply's voltage +is lower than the LDO's target output voltage, the adjustment fails and +the LDO voltage update is skipped. Remove the power supply assignment for +vdd3p0 to avoid this scenario. + +Fixes: 3feea8805d6f ("ARM: dts: imx6sl-evk: Assign corresponding power supply for LDOs") +Signed-off-by: Anson Huang +Signed-off-by: Shawn Guo +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/boot/dts/imx6sl-evk.dts | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/arch/arm/boot/dts/imx6sl-evk.dts ++++ b/arch/arm/boot/dts/imx6sl-evk.dts +@@ -584,10 +584,6 @@ + vin-supply = <&sw2_reg>; + }; + +-&reg_vdd3p0 { +- vin-supply = <&sw2_reg>; +-}; +- + &reg_vdd2p5 { + vin-supply = <&sw2_reg>; + }; diff --git a/queue-5.4/arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch b/queue-5.4/arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch new file mode 100644 index 00000000000..dbc84c70650 --- /dev/null +++ b/queue-5.4/arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch @@ -0,0 +1,39 @@ +From 3479b2843c78ffb60247f522226ba68f93aee355 Mon Sep 17 00:00:00 2001 +From: Anson Huang +Date: Mon, 30 Dec 2019 09:41:10 +0800 +Subject: ARM: dts: imx6sll-evk: Remove incorrect power supply assignment + +From: Anson Huang + +commit 3479b2843c78ffb60247f522226ba68f93aee355 upstream. + +The vdd3p0 LDO's input should come directly from the external USB VBUS, +NOT from the PMIC's power supply. The vdd3p0 LDO's target output voltage +can be controlled by SW, and it requires the input voltage to be high +enough; with an incorrect power supply assigned, if that supply's voltage +is lower than the LDO's target output voltage, the adjustment fails and +the LDO voltage update is skipped. Remove the power supply assignment for +vdd3p0 to avoid this scenario. + +Fixes: 96a9169cf621 ("ARM: dts: imx6sll-evk: Assign corresponding power supply for vdd3p0") +Signed-off-by: Anson Huang +Signed-off-by: Shawn Guo +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/boot/dts/imx6sll-evk.dts | 4 ---- + 1 file changed, 4 deletions(-) + +--- a/arch/arm/boot/dts/imx6sll-evk.dts ++++ b/arch/arm/boot/dts/imx6sll-evk.dts +@@ -265,10 +265,6 @@ + status = "okay"; + }; + +-&reg_3p0 { +- vin-supply = <&sw2_reg>; +-}; +- + &snvs_poweroff { + status = "okay"; + }; diff --git a/queue-5.4/arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch b/queue-5.4/arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch new file mode 100644 index 00000000000..94fab0e21ea --- /dev/null +++ b/queue-5.4/arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch @@ -0,0 +1,53 @@ +From d4918ebb5c256d26696a13e78ac68c146111191a Mon Sep 17 00:00:00 2001 +From: Anson Huang +Date: Mon, 30 Dec 2019 09:41:08 +0800 +Subject: ARM: dts: imx6sx-sdb: Remove incorrect power supply assignment + +From: Anson Huang + +commit d4918ebb5c256d26696a13e78ac68c146111191a upstream.
+ +The vdd3p0 LDO's input should come directly from the external USB VBUS, +NOT from the PMIC's power supply. The vdd3p0 LDO's target output voltage +can be controlled by SW, and it requires the input voltage to be high +enough; with an incorrect power supply assigned, if that supply's voltage +is lower than the LDO's target output voltage, the adjustment fails and +the LDO voltage update is skipped. Remove the power supply assignment for +vdd3p0 to avoid this scenario. + +Fixes: 37a4bdead109 ("ARM: dts: imx6sx-sdb: Assign corresponding power supply for LDOs") +Signed-off-by: Anson Huang +Signed-off-by: Shawn Guo +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/boot/dts/imx6sx-sdb-reva.dts | 4 ---- + arch/arm/boot/dts/imx6sx-sdb.dts | 4 ---- + 2 files changed, 8 deletions(-) + +--- a/arch/arm/boot/dts/imx6sx-sdb-reva.dts ++++ b/arch/arm/boot/dts/imx6sx-sdb-reva.dts +@@ -159,10 +159,6 @@ + vin-supply = <&vgen6_reg>; + }; + +-&reg_vdd3p0 { +- vin-supply = <&sw2_reg>; +-}; +- + &reg_vdd2p5 { + vin-supply = <&vgen6_reg>; + }; +--- a/arch/arm/boot/dts/imx6sx-sdb.dts ++++ b/arch/arm/boot/dts/imx6sx-sdb.dts +@@ -141,10 +141,6 @@ + vin-supply = <&vgen6_reg>; + }; + +-&reg_vdd3p0 { +- vin-supply = <&sw2_reg>; +-}; +- + &reg_vdd2p5 { + vin-supply = <&vgen6_reg>; + }; diff --git a/queue-5.4/arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch b/queue-5.4/arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch new file mode 100644 index 00000000000..9c33926ba69 --- /dev/null +++ b/queue-5.4/arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch @@ -0,0 +1,31 @@ +From 4b0b97e651ecf29f20248420b52b6864fbd40bc2 Mon Sep 17 00:00:00 2001 +From: Marcel Ziswiler +Date: Wed, 8 Jan 2020 17:12:31 +0100 +Subject: ARM: dts: imx7: Fix Toradex Colibri iMX7S 256MB NAND flash support + +From: Marcel Ziswiler + +commit 4b0b97e651ecf29f20248420b52b6864fbd40bc2 upstream. + +It turns out that when the eMMC version was introduced, the gpmi node +required for NAND flash support got enabled exclusively on the Colibri +iMX7D 512MB. + +Fixes: f928a4a377e4 ("ARM: dts: imx7: add Toradex Colibri iMX7D 1GB (eMMC) support") +Signed-off-by: Marcel Ziswiler +Signed-off-by: Shawn Guo +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm/boot/dts/imx7s-colibri.dtsi | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/arm/boot/dts/imx7s-colibri.dtsi ++++ b/arch/arm/boot/dts/imx7s-colibri.dtsi +@@ -49,3 +49,7 @@ + reg = <0x80000000 0x10000000>; + }; + }; ++ ++&gpmi { ++ status = "okay"; ++}; diff --git a/queue-5.4/block-fix-the-type-of-sts-in-bsg_queue_rq.patch b/queue-5.4/block-fix-the-type-of-sts-in-bsg_queue_rq.patch new file mode 100644 index 00000000000..cff1667283b --- /dev/null +++ b/queue-5.4/block-fix-the-type-of-sts-in-bsg_queue_rq.patch @@ -0,0 +1,39 @@ +From c44a4edb20938c85b64a256661443039f5bffdea Mon Sep 17 00:00:00 2001 +From: Bart Van Assche +Date: Tue, 17 Dec 2019 16:23:29 -0800 +Subject: block: Fix the type of 'sts' in bsg_queue_rq() + +From: Bart Van Assche + +commit c44a4edb20938c85b64a256661443039f5bffdea upstream.
+ +This patch fixes the following sparse warnings: + +block/bsg-lib.c:269:19: warning: incorrect type in initializer (different base types) +block/bsg-lib.c:269:19: expected int sts +block/bsg-lib.c:269:19: got restricted blk_status_t [usertype] +block/bsg-lib.c:286:16: warning: incorrect type in return expression (different base types) +block/bsg-lib.c:286:16: expected restricted blk_status_t +block/bsg-lib.c:286:16: got int [assigned] sts + +Cc: Martin Wilck +Fixes: d46fe2cb2dce ("block: drop device references in bsg_queue_rq()") +Signed-off-by: Bart Van Assche +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + block/bsg-lib.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/block/bsg-lib.c ++++ b/block/bsg-lib.c +@@ -266,7 +266,7 @@ static blk_status_t bsg_queue_rq(struct + struct request *req = bd->rq; + struct bsg_set *bset = + container_of(q->tag_set, struct bsg_set, tag_set); +- int sts = BLK_STS_IOERR; ++ blk_status_t sts = BLK_STS_IOERR; + int ret; + + blk_mq_start_request(req); diff --git a/queue-5.4/btrfs-always-copy-scrub-arguments-back-to-user-space.patch b/queue-5.4/btrfs-always-copy-scrub-arguments-back-to-user-space.patch new file mode 100644 index 00000000000..64612c6e92d --- /dev/null +++ b/queue-5.4/btrfs-always-copy-scrub-arguments-back-to-user-space.patch @@ -0,0 +1,64 @@ +From 5afe6ce748c1ea99e0d648153c05075e1ab93afb Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 16 Jan 2020 11:29:20 +0000 +Subject: Btrfs: always copy scrub arguments back to user space + +From: Filipe Manana + +commit 5afe6ce748c1ea99e0d648153c05075e1ab93afb upstream. + +If scrub returns an error we are not copying back the scrub arguments +structure to user space. This prevents user space to know how much +progress scrub has done if an error happened - this includes -ECANCELED +which is returned when users ask for scrub to stop. A particular use +case, which is used in btrfs-progs, is to resume scrub after it is +canceled, in that case it relies on checking the progress from the scrub +arguments structure and then use that progress in a call to resume +scrub. + +So fix this by always copying the scrub arguments structure to user +space, overwriting the value returned to user space with -EFAULT only if +copying the structure failed to let user space know that either that +copying did not happen, and therefore the structure is stale, or it +happened partially and the structure is probably not valid and corrupt +due to the partial copy. + +Reported-by: Graham Cobb +Link: https://lore.kernel.org/linux-btrfs/d0a97688-78be-08de-ca7d-bcb4c7fb397e@cobb.uk.net/ +Fixes: 06fe39ab15a6a4 ("Btrfs: do not overwrite scrub error with fault error in scrub ioctl") +CC: stable@vger.kernel.org # 5.1+ +Reviewed-by: Johannes Thumshirn +Reviewed-by: Qu Wenruo +Tested-by: Graham Cobb +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ioctl.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -4254,7 +4254,19 @@ static long btrfs_ioctl_scrub(struct fil + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, + 0); + +- if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa))) ++ /* ++ * Copy scrub args to user space even if btrfs_scrub_dev() returned an ++ * error. This is important as it allows user space to know how much ++ * progress scrub has done. 
For example, if scrub is canceled we get ++ * -ECANCELED from btrfs_scrub_dev() and return that error back to user ++ * space. Later user space can inspect the progress from the structure ++ * btrfs_ioctl_scrub_args and resume scrub from where it left off ++ * previously (btrfs-progs does this). ++ * If we fail to copy the btrfs_ioctl_scrub_args structure to user space ++ * then return -EFAULT to signal the structure was not copied or it may ++ * be corrupt and unreliable due to a partial copy. ++ */ ++ if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + if (!(sa->flags & BTRFS_SCRUB_READONLY)) diff --git a/queue-5.4/btrfs-check-rw_devices-not-num_devices-for-balance.patch b/queue-5.4/btrfs-check-rw_devices-not-num_devices-for-balance.patch new file mode 100644 index 00000000000..e19180b3db5 --- /dev/null +++ b/queue-5.4/btrfs-check-rw_devices-not-num_devices-for-balance.patch @@ -0,0 +1,92 @@ +From b35cf1f0bf1f2b0b193093338414b9bd63b29015 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 10 Jan 2020 11:11:24 -0500 +Subject: btrfs: check rw_devices, not num_devices for balance + +From: Josef Bacik + +commit b35cf1f0bf1f2b0b193093338414b9bd63b29015 upstream. + +The fstest btrfs/154 reports + + [ 8675.381709] BTRFS: Transaction aborted (error -28) + [ 8675.383302] WARNING: CPU: 1 PID: 31900 at fs/btrfs/block-group.c:2038 btrfs_create_pending_block_groups+0x1e0/0x1f0 [btrfs] + [ 8675.390925] CPU: 1 PID: 31900 Comm: btrfs Not tainted 5.5.0-rc6-default+ #935 + [ 8675.392780] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 + [ 8675.395452] RIP: 0010:btrfs_create_pending_block_groups+0x1e0/0x1f0 [btrfs] + [ 8675.402672] RSP: 0018:ffffb2090888fb00 EFLAGS: 00010286 + [ 8675.404413] RAX: 0000000000000000 RBX: ffff92026dfa91c8 RCX: 0000000000000001 + [ 8675.406609] RDX: 0000000000000000 RSI: ffffffff8e100899 RDI: ffffffff8e100971 + [ 8675.408775] RBP: ffff920247c61660 R08: 0000000000000000 R09: 0000000000000000 + [ 8675.410978] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000ffffffe4 + [ 8675.412647] R13: ffff92026db74000 R14: ffff920247c616b8 R15: ffff92026dfbc000 + [ 8675.413994] FS: 00007fd5e57248c0(0000) GS:ffff92027d800000(0000) knlGS:0000000000000000 + [ 8675.416146] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [ 8675.417833] CR2: 0000564aa51682d8 CR3: 000000006dcbc004 CR4: 0000000000160ee0 + [ 8675.419801] Call Trace: + [ 8675.420742] btrfs_start_dirty_block_groups+0x355/0x480 [btrfs] + [ 8675.422600] btrfs_commit_transaction+0xc8/0xaf0 [btrfs] + [ 8675.424335] reset_balance_state+0x14a/0x190 [btrfs] + [ 8675.425824] btrfs_balance.cold+0xe7/0x154 [btrfs] + [ 8675.427313] ? kmem_cache_alloc_trace+0x235/0x2c0 + [ 8675.428663] btrfs_ioctl_balance+0x298/0x350 [btrfs] + [ 8675.430285] btrfs_ioctl+0x466/0x2550 [btrfs] + [ 8675.431788] ? mem_cgroup_charge_statistics+0x51/0xf0 + [ 8675.433487] ? mem_cgroup_commit_charge+0x56/0x400 + [ 8675.435122] ? do_raw_spin_unlock+0x4b/0xc0 + [ 8675.436618] ? _raw_spin_unlock+0x1f/0x30 + [ 8675.438093] ? __handle_mm_fault+0x499/0x740 + [ 8675.439619] ? do_vfs_ioctl+0x56e/0x770 + [ 8675.441034] do_vfs_ioctl+0x56e/0x770 + [ 8675.442411] ksys_ioctl+0x3a/0x70 + [ 8675.443718] ? 
trace_hardirqs_off_thunk+0x1a/0x1c + [ 8675.445333] __x64_sys_ioctl+0x16/0x20 + [ 8675.446705] do_syscall_64+0x50/0x210 + [ 8675.448059] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [ 8675.479187] BTRFS: error (device vdb) in btrfs_create_pending_block_groups:2038: errno=-28 No space left + +We now use btrfs_can_overcommit() to see if we can flip a block group +read only. Before this would fail because we weren't taking into +account the usable un-allocated space for allocating chunks. With my +patches we were allowed to do the balance, which is technically correct. + +The test is trying to start balance on degraded mount. So now we're +trying to allocate a chunk and cannot because we want to allocate a +RAID1 chunk, but there's only 1 device that's available for usage. This +results in an ENOSPC. + +But we shouldn't even be making it this far, we don't have enough +devices to restripe. The problem is we're using btrfs_num_devices(), +that also includes missing devices. That's not actually what we want, we +need to use rw_devices. + +The chunk_mutex is not needed here, rw_devices changes only in device +add, remove or replace, all are excluded by EXCL_OP mechanism. + +Fixes: e4d8ec0f65b9 ("Btrfs: implement online profile changing") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +[ add stacktrace, update changelog, drop chunk_mutex ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/volumes.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -4066,7 +4066,11 @@ int btrfs_balance(struct btrfs_fs_info * + } + } + +- num_devices = btrfs_num_devices(fs_info); ++ /* ++ * rw_devices will not change at the moment, device add/delete/replace ++ * are excluded by EXCL_OP ++ */ ++ num_devices = fs_info->fs_devices->rw_devices; + + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a diff --git a/queue-5.4/btrfs-do-not-delete-mismatched-root-refs.patch b/queue-5.4/btrfs-do-not-delete-mismatched-root-refs.patch new file mode 100644 index 00000000000..add0eeeae9d --- /dev/null +++ b/queue-5.4/btrfs-do-not-delete-mismatched-root-refs.patch @@ -0,0 +1,45 @@ +From 423a716cd7be16fb08690760691befe3be97d3fc Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 18 Dec 2019 17:20:29 -0500 +Subject: btrfs: do not delete mismatched root refs + +From: Josef Bacik + +commit 423a716cd7be16fb08690760691befe3be97d3fc upstream. + +btrfs_del_root_ref() will simply WARN_ON() if the ref doesn't match in +any way, and then continue to delete the reference. This shouldn't +happen, we have these values because there's more to the reference than +the original root and the sub root. If any of these checks fail, return +-ENOENT. 
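
Expressed as a sketch (paraphrasing the root-tree.c diff below, not the
verbatim kernel code), the behavioral change is:

	/* before: a mismatch only triggered a WARN_ON(), and the ref was
	 * deleted anyway */
	WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
	WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
	WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));

	/* after: any mismatch means this is not the ref we were asked to
	 * delete, so bail out with -ENOENT instead of removing it */
	if (btrfs_root_ref_dirid(leaf, ref) != dirid ||
	    btrfs_root_ref_name_len(leaf, ref) != name_len ||
	    memcmp_extent_buffer(leaf, name, ptr, name_len)) {
		err = -ENOENT;
		goto out;
	}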
+ +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/root-tree.c | 10 ++++++---- + 1 file changed, 6 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/root-tree.c ++++ b/fs/btrfs/root-tree.c +@@ -376,11 +376,13 @@ again: + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_root_ref); +- +- WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid); +- WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len); + ptr = (unsigned long)(ref + 1); +- WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len)); ++ if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || ++ (btrfs_root_ref_name_len(leaf, ref) != name_len) || ++ memcmp_extent_buffer(leaf, name, ptr, name_len)) { ++ err = -ENOENT; ++ goto out; ++ } + *sequence = btrfs_root_ref_sequence(leaf, ref); + + ret = btrfs_del_item(trans, tree_root, path); diff --git a/queue-5.4/btrfs-fix-invalid-removal-of-root-ref.patch b/queue-5.4/btrfs-fix-invalid-removal-of-root-ref.patch new file mode 100644 index 00000000000..9bd2b2cfc38 --- /dev/null +++ b/queue-5.4/btrfs-fix-invalid-removal-of-root-ref.patch @@ -0,0 +1,89 @@ +From d49d3287e74ffe55ae7430d1e795e5f9bf7359ea Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 18 Dec 2019 17:20:28 -0500 +Subject: btrfs: fix invalid removal of root ref + +From: Josef Bacik + +commit d49d3287e74ffe55ae7430d1e795e5f9bf7359ea upstream. + +If we have the following sequence of events + + btrfs sub create A + btrfs sub create A/B + btrfs sub snap A C + mkdir C/foo + mv A/B C/foo + rm -rf * + +We will end up with a transaction abort. + +The reason for this is because we create a root ref for B pointing to A. +When we create a snapshot of C we still have B in our tree, but because +the root ref points to A and not C we will make it appear to be empty. + +The problem happens when we move B into C. This removes the root ref +for B pointing to A and adds a ref of B pointing to C. When we rmdir C +we'll see that we have a ref to our root and remove the root ref, +despite not actually matching our reference name. + +Now btrfs_del_root_ref() allowing this to work is a bug as well, however +we know that this inode does not actually point to a root ref in the +first place, so we shouldn't be calling btrfs_del_root_ref() in the +first place and instead simply look up our dir index for this item and +do the rest of the removal. + +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 27 +++++++++++++++++++-------- + 1 file changed, 19 insertions(+), 8 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -4248,13 +4248,16 @@ static int btrfs_unlink_subvol(struct bt + } + btrfs_release_path(path); + +- ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, +- dir_ino, &index, name, name_len); +- if (ret < 0) { +- if (ret != -ENOENT) { +- btrfs_abort_transaction(trans, ret); +- goto out; +- } ++ /* ++ * This is a placeholder inode for a subvolume we didn't have a ++ * reference to at the time of the snapshot creation. In the meantime ++ * we could have renamed the real subvol link into our snapshot, so ++ * depending on btrfs_del_root_ref to return -ENOENT here is incorret. ++ * Instead simply lookup the dir_index_item for this entry so we can ++ * remove it. 
Otherwise we know we have a ref to the root and we can ++ * call btrfs_del_root_ref, and it _shouldn't_ fail. ++ */ ++ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { + di = btrfs_search_dir_index_item(root, path, dir_ino, + name, name_len); + if (IS_ERR_OR_NULL(di)) { +@@ -4269,8 +4272,16 @@ static int btrfs_unlink_subvol(struct bt + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + index = key.offset; ++ btrfs_release_path(path); ++ } else { ++ ret = btrfs_del_root_ref(trans, objectid, ++ root->root_key.objectid, dir_ino, ++ &index, name, name_len); ++ if (ret) { ++ btrfs_abort_transaction(trans, ret); ++ goto out; ++ } + } +- btrfs_release_path(path); + + ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); + if (ret) { diff --git a/queue-5.4/btrfs-fix-memory-leak-in-qgroup-accounting.patch b/queue-5.4/btrfs-fix-memory-leak-in-qgroup-accounting.patch new file mode 100644 index 00000000000..e16f24c9cf7 --- /dev/null +++ b/queue-5.4/btrfs-fix-memory-leak-in-qgroup-accounting.patch @@ -0,0 +1,80 @@ +From 26ef8493e1ab771cb01d27defca2fa1315dc3980 Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Wed, 8 Jan 2020 21:07:32 +0900 +Subject: btrfs: fix memory leak in qgroup accounting + +From: Johannes Thumshirn + +commit 26ef8493e1ab771cb01d27defca2fa1315dc3980 upstream. + +When running xfstests on the current btrfs I get the following splat from +kmemleak: + +unreferenced object 0xffff88821b2404e0 (size 32): + comm "kworker/u4:7", pid 26663, jiffies 4295283698 (age 8.776s) + hex dump (first 32 bytes): + 01 00 00 00 00 00 00 00 10 ff fd 26 82 88 ff ff ...........&.... + 10 ff fd 26 82 88 ff ff 20 ff fd 26 82 88 ff ff ...&.... ..&.... + backtrace: + [<00000000f94fd43f>] ulist_alloc+0x25/0x60 [btrfs] + [<00000000fd023d99>] btrfs_find_all_roots_safe+0x41/0x100 [btrfs] + [<000000008f17bd32>] btrfs_find_all_roots+0x52/0x70 [btrfs] + [<00000000b7660afb>] btrfs_qgroup_rescan_worker+0x343/0x680 [btrfs] + [<0000000058e66778>] btrfs_work_helper+0xac/0x1e0 [btrfs] + [<00000000f0188930>] process_one_work+0x1cf/0x350 + [<00000000af5f2f8e>] worker_thread+0x28/0x3c0 + [<00000000b55a1add>] kthread+0x109/0x120 + [<00000000f88cbd17>] ret_from_fork+0x35/0x40 + +This corresponds to: + + (gdb) l *(btrfs_find_all_roots_safe+0x41) + 0x8d7e1 is in btrfs_find_all_roots_safe (fs/btrfs/backref.c:1413). + 1408 + 1409 tmp = ulist_alloc(GFP_NOFS); + 1410 if (!tmp) + 1411 return -ENOMEM; + 1412 *roots = ulist_alloc(GFP_NOFS); + 1413 if (!*roots) { + 1414 ulist_free(tmp); + 1415 return -ENOMEM; + 1416 } + 1417 + +Following the lifetime of the allocated 'roots' ulist, it gets freed +again in btrfs_qgroup_account_extent(). + +But this does not happen if the function is called with the +'BTRFS_FS_QUOTA_ENABLED' flag cleared, then btrfs_qgroup_account_extent() +does a short leave and directly returns. + +Instead of directly returning we should jump to the 'out_free' in order to +free all resources as expected. + +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Qu Wenruo +Signed-off-by: Johannes Thumshirn +[ add comment ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -2423,8 +2423,12 @@ int btrfs_qgroup_account_extent(struct b + u64 nr_old_roots = 0; + int ret = 0; + ++ /* ++ * If quotas get disabled meanwhile, the resouces need to be freed and ++ * we can't just exit here. 
++ */ + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) +- return 0; ++ goto out_free; + + if (new_roots) { + if (!maybe_fs_roots(new_roots)) diff --git a/queue-5.4/btrfs-relocation-fix-reloc_root-lifespan-and-access.patch b/queue-5.4/btrfs-relocation-fix-reloc_root-lifespan-and-access.patch new file mode 100644 index 00000000000..7892cc3c0f3 --- /dev/null +++ b/queue-5.4/btrfs-relocation-fix-reloc_root-lifespan-and-access.patch @@ -0,0 +1,234 @@ +From 6282675e6708ec78518cc0e9ad1f1f73d7c5c53d Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Wed, 8 Jan 2020 13:12:00 +0800 +Subject: btrfs: relocation: fix reloc_root lifespan and access + +From: Qu Wenruo + +commit 6282675e6708ec78518cc0e9ad1f1f73d7c5c53d upstream. + +[BUG] +There are several different KASAN reports for balance + snapshot +workloads. Involved call paths include: + + should_ignore_root+0x54/0xb0 [btrfs] + build_backref_tree+0x11af/0x2280 [btrfs] + relocate_tree_blocks+0x391/0xb80 [btrfs] + relocate_block_group+0x3e5/0xa00 [btrfs] + btrfs_relocate_block_group+0x240/0x4d0 [btrfs] + btrfs_relocate_chunk+0x53/0xf0 [btrfs] + btrfs_balance+0xc91/0x1840 [btrfs] + btrfs_ioctl_balance+0x416/0x4e0 [btrfs] + btrfs_ioctl+0x8af/0x3e60 [btrfs] + do_vfs_ioctl+0x831/0xb10 + + create_reloc_root+0x9f/0x460 [btrfs] + btrfs_reloc_post_snapshot+0xff/0x6c0 [btrfs] + create_pending_snapshot+0xa9b/0x15f0 [btrfs] + create_pending_snapshots+0x111/0x140 [btrfs] + btrfs_commit_transaction+0x7a6/0x1360 [btrfs] + btrfs_mksubvol+0x915/0x960 [btrfs] + btrfs_ioctl_snap_create_transid+0x1d5/0x1e0 [btrfs] + btrfs_ioctl_snap_create_v2+0x1d3/0x270 [btrfs] + btrfs_ioctl+0x241b/0x3e60 [btrfs] + do_vfs_ioctl+0x831/0xb10 + + btrfs_reloc_pre_snapshot+0x85/0xc0 [btrfs] + create_pending_snapshot+0x209/0x15f0 [btrfs] + create_pending_snapshots+0x111/0x140 [btrfs] + btrfs_commit_transaction+0x7a6/0x1360 [btrfs] + btrfs_mksubvol+0x915/0x960 [btrfs] + btrfs_ioctl_snap_create_transid+0x1d5/0x1e0 [btrfs] + btrfs_ioctl_snap_create_v2+0x1d3/0x270 [btrfs] + btrfs_ioctl+0x241b/0x3e60 [btrfs] + do_vfs_ioctl+0x831/0xb10 + +[CAUSE] +All these call sites are only relying on root->reloc_root, which can +undergo btrfs_drop_snapshot(), and since we don't have real refcount +based protection to reloc roots, we can reach already dropped reloc +root, triggering KASAN. + +[FIX] +To avoid such access to unstable root->reloc_root, we should check +BTRFS_ROOT_DEAD_RELOC_TREE bit first. + +This patch introduces wrappers that provide the correct way to check the +bit with memory barriers protection. + +Most callers don't distinguish merged reloc tree and no reloc tree. The +only exception is should_ignore_root(), as merged reloc tree can be +ignored, while no reloc tree shouldn't. 
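
The bit/pointer ordering the fix depends on can be illustrated with a small
self-contained userspace analogue (C11 atomics standing in for the kernel's
set_bit/clear_bit plus smp_wmb()/smp_rmb(); a sketch of the idea, not the
kernel code):

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic int dead;	   /* ~ BTRFS_ROOT_DEAD_RELOC_TREE */
	static _Atomic(void *) reloc_root; /* ~ root->reloc_root */

	/* cleanup side, like clean_dirty_subvols(): reloc_root must be seen
	 * as NULL before the DEAD bit is seen as cleared, otherwise a reader
	 * finding the bit clear could still load a stale pointer */
	static void cleanup(void)
	{
		atomic_store_explicit(&reloc_root, NULL, memory_order_relaxed);
		atomic_thread_fence(memory_order_release);	/* ~ smp_wmb() */
		atomic_store_explicit(&dead, 0, memory_order_relaxed);
	}

	/* reader side, like have_reloc_root(): the bit is read before the
	 * pointer, with an acquire fence ordering the two loads */
	static int have_root(void)
	{
		int d = atomic_load_explicit(&dead, memory_order_relaxed);

		atomic_thread_fence(memory_order_acquire);	/* ~ smp_rmb() */
		if (d)
			return 0;
		return atomic_load_explicit(&reloc_root,
					    memory_order_relaxed) != NULL;
	}

	int main(void)
	{
		atomic_store(&reloc_root, (void *)&dead); /* pretend valid */
		atomic_store(&dead, 1);
		printf("dead set: have_root() = %d\n", have_root());
		cleanup();
		printf("after cleanup: have_root() = %d\n", have_root());
		return 0;
	}

The mirror-image ordering in the patch (set the DEAD bit, smp_wmb(), then
drop reloc_root in btrfs_update_reloc_root()) protects the merge path the
same way.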
+ +[CRITICAL SECTION ANALYSIS] +Although test_bit()/set_bit()/clear_bit() doesn't imply a barrier, the +DEAD_RELOC_TREE bit has extra help from transaction as a higher level +barrier, the lifespan of root::reloc_root and DEAD_RELOC_TREE bit are: + + NULL: reloc_root is NULL PTR: reloc_root is not NULL + 0: DEAD_RELOC_ROOT bit not set DEAD: DEAD_RELOC_ROOT bit set + + (NULL, 0) Initial state __ + | /\ Section A + btrfs_init_reloc_root() \/ + | __ + (PTR, 0) reloc_root initialized /\ + | | + btrfs_update_reloc_root() | Section B + | | + (PTR, DEAD) reloc_root has been merged \/ + | __ + === btrfs_commit_transaction() ==================== + | /\ + clean_dirty_subvols() | + | | Section C + (NULL, DEAD) reloc_root cleanup starts \/ + | __ + btrfs_drop_snapshot() /\ + | | Section D + (NULL, 0) Back to initial state \/ + +Every have_reloc_root() or test_bit(DEAD_RELOC_ROOT) caller holds +transaction handle, so none of such caller can cross transaction boundary. + +In Section A, every caller just found no DEAD bit, and grab reloc_root. + +In the cross section A-B, caller may get no DEAD bit, but since reloc_root +is still completely valid thus accessing reloc_root is completely safe. + +No test_bit() caller can cross the boundary of Section B and Section C. + +In Section C, every caller found the DEAD bit, so no one will access +reloc_root. + +In the cross section C-D, either caller gets the DEAD bit set, avoiding +access reloc_root no matter if it's safe or not. Or caller get the DEAD +bit cleared, then access reloc_root, which is already NULL, nothing will +be wrong. + +The memory write barriers are between the reloc_root updates and bit +set/clear, the pairing read side is before test_bit. + +Reported-by: Zygo Blaxell +Fixes: d2311e698578 ("btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +[ barriers ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/relocation.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 46 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -517,6 +517,34 @@ static int update_backref_cache(struct b + return 1; + } + ++static bool reloc_root_is_dead(struct btrfs_root *root) ++{ ++ /* ++ * Pair with set_bit/clear_bit in clean_dirty_subvols and ++ * btrfs_update_reloc_root. We need to see the updated bit before ++ * trying to access reloc_root ++ */ ++ smp_rmb(); ++ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) ++ return true; ++ return false; ++} ++ ++/* ++ * Check if this subvolume tree has valid reloc tree. ++ * ++ * Reloc tree after swap is considered dead, thus not considered as valid. ++ * This is enough for most callers, as they don't distinguish dead reloc root ++ * from no reloc root. But should_ignore_root() below is a special case. 
++ */ ++static bool have_reloc_root(struct btrfs_root *root) ++{ ++ if (reloc_root_is_dead(root)) ++ return false; ++ if (!root->reloc_root) ++ return false; ++ return true; ++} + + static int should_ignore_root(struct btrfs_root *root) + { +@@ -525,6 +553,10 @@ static int should_ignore_root(struct btr + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + return 0; + ++ /* This root has been merged with its reloc tree, we can ignore it */ ++ if (reloc_root_is_dead(root)) ++ return 1; ++ + reloc_root = root->reloc_root; + if (!reloc_root) + return 0; +@@ -1439,7 +1471,7 @@ int btrfs_init_reloc_root(struct btrfs_t + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ +- if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) ++ if (reloc_root_is_dead(root)) + return 0; + + if (root->reloc_root) { +@@ -1478,8 +1510,7 @@ int btrfs_update_reloc_root(struct btrfs + struct btrfs_root_item *root_item; + int ret; + +- if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) || +- !root->reloc_root) ++ if (!have_reloc_root(root)) + goto out; + + reloc_root = root->reloc_root; +@@ -1489,6 +1520,11 @@ int btrfs_update_reloc_root(struct btrfs + if (fs_info->reloc_ctl->merge_reloc_tree && + btrfs_root_refs(root_item) == 0) { + set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); ++ /* ++ * Mark the tree as dead before we change reloc_root so ++ * have_reloc_root will not touch it from now on. ++ */ ++ smp_wmb(); + __del_reloc_root(reloc_root); + } + +@@ -2202,6 +2238,11 @@ static int clean_dirty_subvols(struct re + if (ret2 < 0 && !ret) + ret = ret2; + } ++ /* ++ * Need barrier to ensure clear_bit() only happens after ++ * root->reloc_root = NULL. Pairs with have_reloc_root. ++ */ ++ smp_wmb(); + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); + btrfs_put_fs_root(root); + } else { +@@ -4721,7 +4762,7 @@ void btrfs_reloc_pre_snapshot(struct btr + struct btrfs_root *root = pending->root; + struct reloc_control *rc = root->fs_info->reloc_ctl; + +- if (!root->reloc_root || !rc) ++ if (!rc || !have_reloc_root(root)) + return; + + if (!rc->merge_reloc_tree) +@@ -4755,7 +4796,7 @@ int btrfs_reloc_post_snapshot(struct btr + struct reloc_control *rc = root->fs_info->reloc_ctl; + int ret; + +- if (!root->reloc_root || !rc) ++ if (!rc || !have_reloc_root(root)) + return 0; + + rc = root->fs_info->reloc_ctl; diff --git a/queue-5.4/mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch b/queue-5.4/mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch new file mode 100644 index 00000000000..774c371cc95 --- /dev/null +++ b/queue-5.4/mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch @@ -0,0 +1,263 @@ +From 8e57f8acbbd121ecfb0c9dc13b8b030f86c6bd3b Mon Sep 17 00:00:00 2001 +From: Vlastimil Babka +Date: Mon, 13 Jan 2020 16:29:20 -0800 +Subject: mm, debug_pagealloc: don't rely on static keys too early + +From: Vlastimil Babka + +commit 8e57f8acbbd121ecfb0c9dc13b8b030f86c6bd3b upstream. + +Commit 96a2b03f281d ("mm, debug_pagelloc: use static keys to enable +debugging") has introduced a static key to reduce overhead when +debug_pagealloc is compiled in but not enabled. It relied on the +assumption that jump_label_init() is called before parse_early_param() +as in start_kernel(), so when the "debug_pagealloc=on" option is parsed, +it is safe to enable the static key. + +However, it turns out multiple architectures call parse_early_param() +earlier from their setup_arch(). 
x86 also calls jump_label_init() even +earlier, so no issue was found while testing the commit, but same is not +true for e.g. ppc64 and s390 where the kernel would not boot with +debug_pagealloc=on as found by our QA. + +To fix this without tricky changes to init code of multiple +architectures, this patch partially reverts the static key conversion +from 96a2b03f281d. Init-time and non-fastpath calls (such as in arch +code) of debug_pagealloc_enabled() will again test a simple bool +variable. Fastpath mm code is converted to a new +debug_pagealloc_enabled_static() variant that relies on the static key, +which is enabled in a well-defined point in mm_init() where it's +guaranteed that jump_label_init() has been called, regardless of +architecture. + +[sfr@canb.auug.org.au: export _debug_pagealloc_enabled_early] + Link: http://lkml.kernel.org/r/20200106164944.063ac07b@canb.auug.org.au +Link: http://lkml.kernel.org/r/20191219130612.23171-1-vbabka@suse.cz +Fixes: 96a2b03f281d ("mm, debug_pagelloc: use static keys to enable debugging") +Signed-off-by: Vlastimil Babka +Signed-off-by: Stephen Rothwell +Cc: Joonsoo Kim +Cc: "Kirill A. Shutemov" +Cc: Michal Hocko +Cc: Vlastimil Babka +Cc: Matthew Wilcox +Cc: Mel Gorman +Cc: Peter Zijlstra +Cc: Borislav Petkov +Cc: Qian Cai +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/mm.h | 18 +++++++++++++++--- + init/main.c | 1 + + mm/page_alloc.c | 37 +++++++++++++------------------------ + mm/slab.c | 4 ++-- + mm/slub.c | 2 +- + mm/vmalloc.c | 4 ++-- + 6 files changed, 34 insertions(+), 32 deletions(-) + +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2666,14 +2666,26 @@ static inline bool want_init_on_free(voi + !page_poisoning_enabled(); + } + +-#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT +-DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); ++#ifdef CONFIG_DEBUG_PAGEALLOC ++extern void init_debug_pagealloc(void); + #else +-DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); ++static inline void init_debug_pagealloc(void) {} + #endif ++extern bool _debug_pagealloc_enabled_early; ++DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); + + static inline bool debug_pagealloc_enabled(void) + { ++ return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) && ++ _debug_pagealloc_enabled_early; ++} ++ ++/* ++ * For use in fast paths after init_debug_pagealloc() has run, or when a ++ * false negative result is not harmful when called too early. ++ */ ++static inline bool debug_pagealloc_enabled_static(void) ++{ + if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) + return false; + +--- a/init/main.c ++++ b/init/main.c +@@ -553,6 +553,7 @@ static void __init mm_init(void) + * bigger than MAX_ORDER unless SPARSEMEM. 
+ */ + page_ext_init_flatmem(); ++ init_debug_pagealloc(); + report_meminit(); + mem_init(); + kmem_cache_init(); +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -694,34 +694,27 @@ void prep_compound_page(struct page *pag + #ifdef CONFIG_DEBUG_PAGEALLOC + unsigned int _debug_guardpage_minorder; + +-#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT +-DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled); +-#else ++bool _debug_pagealloc_enabled_early __read_mostly ++ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); ++EXPORT_SYMBOL(_debug_pagealloc_enabled_early); + DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); +-#endif + EXPORT_SYMBOL(_debug_pagealloc_enabled); + + DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); + + static int __init early_debug_pagealloc(char *buf) + { +- bool enable = false; +- +- if (kstrtobool(buf, &enable)) +- return -EINVAL; +- +- if (enable) +- static_branch_enable(&_debug_pagealloc_enabled); +- +- return 0; ++ return kstrtobool(buf, &_debug_pagealloc_enabled_early); + } + early_param("debug_pagealloc", early_debug_pagealloc); + +-static void init_debug_guardpage(void) ++void init_debug_pagealloc(void) + { + if (!debug_pagealloc_enabled()) + return; + ++ static_branch_enable(&_debug_pagealloc_enabled); ++ + if (!debug_guardpage_minorder()) + return; + +@@ -1186,7 +1179,7 @@ static __always_inline bool free_pages_p + */ + arch_free_page(page, order); + +- if (debug_pagealloc_enabled()) ++ if (debug_pagealloc_enabled_static()) + kernel_map_pages(page, 1 << order, 0); + + kasan_free_nondeferred_pages(page, order); +@@ -1207,7 +1200,7 @@ static bool free_pcp_prepare(struct page + + static bool bulkfree_pcp_prepare(struct page *page) + { +- if (debug_pagealloc_enabled()) ++ if (debug_pagealloc_enabled_static()) + return free_pages_check(page); + else + return false; +@@ -1221,7 +1214,7 @@ static bool bulkfree_pcp_prepare(struct + */ + static bool free_pcp_prepare(struct page *page) + { +- if (debug_pagealloc_enabled()) ++ if (debug_pagealloc_enabled_static()) + return free_pages_prepare(page, 0, true); + else + return free_pages_prepare(page, 0, false); +@@ -1973,10 +1966,6 @@ void __init page_alloc_init_late(void) + + for_each_populated_zone(zone) + set_zone_contiguous(zone); +- +-#ifdef CONFIG_DEBUG_PAGEALLOC +- init_debug_guardpage(); +-#endif + } + + #ifdef CONFIG_CMA +@@ -2106,7 +2095,7 @@ static inline bool free_pages_prezeroed( + */ + static inline bool check_pcp_refill(struct page *page) + { +- if (debug_pagealloc_enabled()) ++ if (debug_pagealloc_enabled_static()) + return check_new_page(page); + else + return false; +@@ -2128,7 +2117,7 @@ static inline bool check_pcp_refill(stru + } + static inline bool check_new_pcp(struct page *page) + { +- if (debug_pagealloc_enabled()) ++ if (debug_pagealloc_enabled_static()) + return check_new_page(page); + else + return false; +@@ -2155,7 +2144,7 @@ inline void post_alloc_hook(struct page + set_page_refcounted(page); + + arch_alloc_page(page, order); +- if (debug_pagealloc_enabled()) ++ if (debug_pagealloc_enabled_static()) + kernel_map_pages(page, 1 << order, 1); + kasan_alloc_pages(page, order); + kernel_poison_pages(page, 1 << order, 1); +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -1415,7 +1415,7 @@ static void kmem_rcu_free(struct rcu_hea + #if DEBUG + static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) + { +- if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && ++ if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + +@@ -2007,7 +2007,7 @@ int 
__kmem_cache_create(struct kmem_cach + * to check size >= 256. It guarantees that all necessary small + * sized slab is initialized in current slab initialization sequence. + */ +- if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && ++ if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) && + size >= 256 && cachep->object_size > cache_line_size()) { + if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { + size_t tmp_size = ALIGN(size, PAGE_SIZE); +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -290,7 +290,7 @@ static inline void *get_freepointer_safe + unsigned long freepointer_addr; + void *p; + +- if (!debug_pagealloc_enabled()) ++ if (!debug_pagealloc_enabled_static()) + return get_freepointer(s, object); + + freepointer_addr = (unsigned long)object + s->offset; +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -1349,7 +1349,7 @@ static void free_unmap_vmap_area(struct + { + flush_cache_vunmap(va->va_start, va->va_end); + unmap_vmap_area(va); +- if (debug_pagealloc_enabled()) ++ if (debug_pagealloc_enabled_static()) + flush_tlb_kernel_range(va->va_start, va->va_end); + + free_vmap_area_noflush(va); +@@ -1647,7 +1647,7 @@ static void vb_free(const void *addr, un + + vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + +- if (debug_pagealloc_enabled()) ++ if (debug_pagealloc_enabled_static()) + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + size); + diff --git a/queue-5.4/mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch b/queue-5.4/mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch new file mode 100644 index 00000000000..c49ad4afc0f --- /dev/null +++ b/queue-5.4/mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch @@ -0,0 +1,134 @@ +From 97d3d0f9a1cf132c63c0b8b8bd497b8a56283dd9 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Mon, 13 Jan 2020 16:29:10 -0800 +Subject: mm/huge_memory.c: thp: fix conflict of above-47bit hint address and PMD alignment + +From: Kirill A. Shutemov + +commit 97d3d0f9a1cf132c63c0b8b8bd497b8a56283dd9 upstream. + +Patch series "Fix two above-47bit hint address vs. THP bugs". + +The two get_unmapped_area() implementations have to be fixed to provide +THP-friendly mappings if above-47bit hint address is specified. + +This patch (of 2): + +Filesystems use thp_get_unmapped_area() to provide THP-friendly +mappings. For DAX in particular. + +Normally, the kernel doesn't create userspace mappings above 47-bit, +even if the machine allows this (such as with 5-level paging on x86-64). +Not all user space is ready to handle wide addresses. It's known that +at least some JIT compilers use higher bits in pointers to encode their +information. + +Userspace can ask for allocation from full address space by specifying +hint address (with or without MAP_FIXED) above 47-bits. If the +application doesn't need a particular address, but wants to allocate +from whole address space it can specify -1 as a hint address. + +Unfortunately, this trick breaks thp_get_unmapped_area(): the function +would not try to allocate PMD-aligned area if *any* hint address +specified. + +Modify the routine to handle it correctly: + + - Try to allocate the space at the specified hint address with length + padding required for PMD alignment. + - If failed, retry without length padding (but with the same hint + address); + - If the returned address matches the hint address return it. + - Otherwise, align the address as required for THP and return. 
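
The arithmetic of the last step can be checked with a tiny standalone
program (illustrative values only; the 2 MiB PMD size is an x86-64
assumption):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		const uint64_t pmd_size = 2ULL << 20;	/* 2 MiB */
		uint64_t off = 0x1034000;	/* example file offset */
		uint64_t ret = 0x7f0000001000;	/* example area found */

		/* mirrors "ret += (off - ret) & (size - 1)" from the patch:
		 * shift ret forward inside the padded area until the mapping
		 * address and the file offset are congruent modulo PMD size */
		ret += (off - ret) & (pmd_size - 1);

		printf("addr = %#lx, (addr - off) %% PMD = %lu\n",
		       (unsigned long)ret,
		       (unsigned long)((ret - off) & (pmd_size - 1)));
		return 0;
	}

Because the length was padded by one PMD size, the shifted address still
leaves room for the requested length, and PMD-aligned file extents now land
on PMD-aligned virtual addresses.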
+ +The user specified hint address is passed down to get_unmapped_area() so +above-47bit hint address will be taken into account without breaking +alignment requirements. + +Link: http://lkml.kernel.org/r/20191220142548.7118-2-kirill.shutemov@linux.intel.com +Fixes: b569bab78d8d ("x86/mm: Prepare to expose larger address space to userspace") +Signed-off-by: Kirill A. Shutemov +Reported-by: Thomas Willhalm +Tested-by: Dan Williams +Cc: "Aneesh Kumar K . V" +Cc: "Bruggeman, Otto G" +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/huge_memory.c | 38 ++++++++++++++++++++++++-------------- + 1 file changed, 24 insertions(+), 14 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -527,13 +527,13 @@ void prep_transhuge_page(struct page *pa + set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); + } + +-static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, ++static unsigned long __thp_get_unmapped_area(struct file *filp, ++ unsigned long addr, unsigned long len, + loff_t off, unsigned long flags, unsigned long size) + { +- unsigned long addr; + loff_t off_end = off + len; + loff_t off_align = round_up(off, size); +- unsigned long len_pad; ++ unsigned long len_pad, ret; + + if (off_end <= off_align || (off_end - off_align) < size) + return 0; +@@ -542,30 +542,40 @@ static unsigned long __thp_get_unmapped_ + if (len_pad < len || (off + len_pad) < off) + return 0; + +- addr = current->mm->get_unmapped_area(filp, 0, len_pad, ++ ret = current->mm->get_unmapped_area(filp, addr, len_pad, + off >> PAGE_SHIFT, flags); +- if (IS_ERR_VALUE(addr)) ++ ++ /* ++ * The failure might be due to length padding. The caller will retry ++ * without the padding. ++ */ ++ if (IS_ERR_VALUE(ret)) + return 0; + +- addr += (off - addr) & (size - 1); +- return addr; ++ /* ++ * Do not try to align to THP boundary if allocation at the address ++ * hint succeeds. ++ */ ++ if (ret == addr) ++ return addr; ++ ++ ret += (off - ret) & (size - 1); ++ return ret; + } + + unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, unsigned long flags) + { ++ unsigned long ret; + loff_t off = (loff_t)pgoff << PAGE_SHIFT; + +- if (addr) +- goto out; + if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD)) + goto out; + +- addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE); +- if (addr) +- return addr; +- +- out: ++ ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE); ++ if (ret) ++ return ret; ++out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); + } + EXPORT_SYMBOL_GPL(thp_get_unmapped_area); diff --git a/queue-5.4/mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch b/queue-5.4/mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch new file mode 100644 index 00000000000..3921f2e610c --- /dev/null +++ b/queue-5.4/mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch @@ -0,0 +1,39 @@ +From 554913f600b45d73de12ad58c1ac7baa0f22a703 Mon Sep 17 00:00:00 2001 +From: Yang Shi +Date: Mon, 13 Jan 2020 16:29:36 -0800 +Subject: mm: khugepaged: add trace status description for SCAN_PAGE_HAS_PRIVATE + +From: Yang Shi + +commit 554913f600b45d73de12ad58c1ac7baa0f22a703 upstream. 
+ +Commit 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) +FS") introduced a new khugepaged scan result: SCAN_PAGE_HAS_PRIVATE, but +the corresponding description for trace events were not added. + +Link: http://lkml.kernel.org/r/1574793844-2914-1-git-send-email-yang.shi@linux.alibaba.com +Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS") +Signed-off-by: Yang Shi +Cc: Song Liu +Cc: Kirill A. Shutemov +Cc: Anshuman Khandual +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/trace/events/huge_memory.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/include/trace/events/huge_memory.h ++++ b/include/trace/events/huge_memory.h +@@ -31,7 +31,8 @@ + EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ + EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ + EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ +- EMe(SCAN_TRUNCATED, "truncated") \ ++ EM( SCAN_TRUNCATED, "truncated") \ ++ EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ + + #undef EM + #undef EMe diff --git a/queue-5.4/mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch b/queue-5.4/mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch new file mode 100644 index 00000000000..b507d78ee27 --- /dev/null +++ b/queue-5.4/mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch @@ -0,0 +1,85 @@ +From 2fe20210fc5f5e62644678b8f927c49f2c6f42a7 Mon Sep 17 00:00:00 2001 +From: Adrian Huang +Date: Mon, 13 Jan 2020 16:29:32 -0800 +Subject: mm: memcg/slab: call flush_memcg_workqueue() only if memcg workqueue is valid + +From: Adrian Huang + +commit 2fe20210fc5f5e62644678b8f927c49f2c6f42a7 upstream. + +When booting with amd_iommu=off, the following WARNING message +appears: + + AMD-Vi: AMD IOMMU disabled on kernel command-line + ------------[ cut here ]------------ + WARNING: CPU: 0 PID: 0 at kernel/workqueue.c:2772 flush_workqueue+0x42e/0x450 + Modules linked in: + CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.5.0-rc3-amd-iommu #6 + Hardware name: Lenovo ThinkSystem SR655-2S/7D2WRCZ000, BIOS D8E101L-1.00 12/05/2019 + RIP: 0010:flush_workqueue+0x42e/0x450 + Code: ff 0f 0b e9 7a fd ff ff 4d 89 ef e9 33 fe ff ff 0f 0b e9 7f fd ff ff 0f 0b e9 bc fd ff ff 0f 0b e9 a8 fd ff ff e8 52 2c fe ff <0f> 0b 31 d2 48 c7 c6 e0 88 c5 95 48 c7 c7 d8 ad f0 95 e8 19 f5 04 + Call Trace: + kmem_cache_destroy+0x69/0x260 + iommu_go_to_state+0x40c/0x5ab + amd_iommu_prepare+0x16/0x2a + irq_remapping_prepare+0x36/0x5f + enable_IR_x2apic+0x21/0x172 + default_setup_apic_routing+0x12/0x6f + apic_intr_mode_init+0x1a1/0x1f1 + x86_late_time_init+0x17/0x1c + start_kernel+0x480/0x53f + secondary_startup_64+0xb6/0xc0 + ---[ end trace 30894107c3749449 ]--- + x2apic: IRQ remapping doesn't support X2APIC mode + x2apic disabled + +The warning is caused by the calling of 'kmem_cache_destroy()' +in free_iommu_resources(). Here is the call path: + + free_iommu_resources + kmem_cache_destroy + flush_memcg_workqueue + flush_workqueue + +The root cause is that the IOMMU subsystem runs before the workqueue +subsystem, which the variable 'wq_online' is still 'false'. This leads +to the statement 'if (WARN_ON(!wq_online))' in flush_workqueue() is +'true'. + +Since the variable 'memcg_kmem_cache_wq' is not allocated during the +time, it is unnecessary to call flush_memcg_workqueue(). This prevents +the WARNING message triggered by flush_workqueue(). 
+ +Link: http://lkml.kernel.org/r/20200103085503.1665-1-ahuang12@lenovo.com +Fixes: 92ee383f6daab ("mm: fix race between kmem_cache destroy, create and deactivate") +Signed-off-by: Adrian Huang +Reported-by: Xiaochun Lee +Reviewed-by: Shakeel Butt +Cc: Joerg Roedel +Cc: Christoph Lameter +Cc: Pekka Enberg +Cc: David Rientjes +Cc: Joonsoo Kim +Cc: Michal Hocko +Cc: Johannes Weiner +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/slab_common.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -903,7 +903,8 @@ static void flush_memcg_workqueue(struct + * deactivates the memcg kmem_caches through workqueue. Make sure all + * previous workitems on workqueue are processed. + */ +- flush_workqueue(memcg_kmem_cache_wq); ++ if (likely(memcg_kmem_cache_wq)) ++ flush_workqueue(memcg_kmem_cache_wq); + + /* + * If we're racing with children kmem_cache deactivation, it might diff --git a/queue-5.4/mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch b/queue-5.4/mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch new file mode 100644 index 00000000000..6d4913ba568 --- /dev/null +++ b/queue-5.4/mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch @@ -0,0 +1,178 @@ +From 4a87e2a25dc27131c3cce5e94421622193305638 Mon Sep 17 00:00:00 2001 +From: Roman Gushchin +Date: Mon, 13 Jan 2020 16:29:16 -0800 +Subject: mm: memcg/slab: fix percpu slab vmstats flushing + +From: Roman Gushchin + +commit 4a87e2a25dc27131c3cce5e94421622193305638 upstream. + +Currently slab percpu vmstats are flushed twice: during the memcg +offlining and just before freeing the memcg structure. Each time percpu +counters are summed, added to the atomic counterparts and propagated up +by the cgroup tree. + +The second flushing is required due to how recursive vmstats are +implemented: counters are batched in percpu variables on a local level, +and once a percpu value is crossing some predefined threshold, it spills +over to atomic values on the local and each ascendant levels. It means +that without flushing some numbers cached in percpu variables will be +dropped on floor each time a cgroup is destroyed. And with uptime the +error on upper levels might become noticeable. + +The first flushing aims to make counters on ancestor levels more +precise. Dying cgroups may resume in the dying state for a long time. +After kmem_cache reparenting which is performed during the offlining +slab counters of the dying cgroup don't have any chances to be updated, +because any slab operations will be performed on the parent level. It +means that the inaccuracy caused by percpu batching will not decrease up +to the final destruction of the cgroup. By the original idea flushing +slab counters during the offlining should minimize the visible +inaccuracy of slab counters on the parent level. + +The problem is that percpu counters are not zeroed after the first +flushing. So every cached percpu value is summed twice. It creates a +small error (up to 32 pages per cpu, but usually less) which accumulates +on parent cgroup level. After creating and destroying of thousands of +child cgroups, slab counter on parent level can be way off the real +value. + +For now, let's just stop flushing slab counters on memcg offlining. It +can't be done correctly without scheduling a work on each cpu: reading +and zeroing it during css offlining can race with an asynchronous +update, which doesn't expect values to be changed underneath. 
+ +With this change, slab counters on parent level will become eventually +consistent. Once all dying children are gone, values are correct. And +if not, the error is capped by 32 * NR_CPUS pages per dying cgroup. + +It's not perfect, as slab are reparented, so any updates after the +reparenting will happen on the parent level. It means that if a slab +page was allocated, a counter on child level was bumped, then the page +was reparented and freed, the annihilation of positive and negative +counter values will not happen until the child cgroup is released. It +makes slab counters different from others, and it might want us to +implement flushing in a correct form again. But it's also a question of +performance: scheduling a work on each cpu isn't free, and it's an open +question if the benefit of having more accurate counters is worth it. + +We might also consider flushing all counters on offlining, not only slab +counters. + +So let's fix the main problem now: make the slab counters eventually +consistent, so at least the error won't grow with uptime (or more +precisely the number of created and destroyed cgroups). And think about +the accuracy of counters separately. + +Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com +Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining") +Signed-off-by: Roman Gushchin +Acked-by: Johannes Weiner +Acked-by: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/mmzone.h | 5 ++--- + mm/memcontrol.c | 37 +++++++++---------------------------- + 2 files changed, 11 insertions(+), 31 deletions(-) + +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -215,9 +215,8 @@ enum node_stat_item { + NR_INACTIVE_FILE, /* " " " " " */ + NR_ACTIVE_FILE, /* " " " " " */ + NR_UNEVICTABLE, /* " " " " " */ +- NR_SLAB_RECLAIMABLE, /* Please do not reorder this item */ +- NR_SLAB_UNRECLAIMABLE, /* and this one without looking at +- * memcg_flush_percpu_vmstats() first. 
*/ ++ NR_SLAB_RECLAIMABLE, ++ NR_SLAB_UNRECLAIMABLE, + NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ + NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ + WORKINGSET_NODES, +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -3404,49 +3404,34 @@ static u64 mem_cgroup_read_u64(struct cg + } + } + +-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) ++static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg) + { +- unsigned long stat[MEMCG_NR_STAT]; ++ unsigned long stat[MEMCG_NR_STAT] = {0}; + struct mem_cgroup *mi; + int node, cpu, i; +- int min_idx, max_idx; +- +- if (slab_only) { +- min_idx = NR_SLAB_RECLAIMABLE; +- max_idx = NR_SLAB_UNRECLAIMABLE; +- } else { +- min_idx = 0; +- max_idx = MEMCG_NR_STAT; +- } +- +- for (i = min_idx; i < max_idx; i++) +- stat[i] = 0; + + for_each_online_cpu(cpu) +- for (i = min_idx; i < max_idx; i++) ++ for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) +- for (i = min_idx; i < max_idx; i++) ++ for (i = 0; i < MEMCG_NR_STAT; i++) + atomic_long_add(stat[i], &mi->vmstats[i]); + +- if (!slab_only) +- max_idx = NR_VM_NODE_STAT_ITEMS; +- + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + struct mem_cgroup_per_node *pi; + +- for (i = min_idx; i < max_idx; i++) ++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) +- for (i = min_idx; i < max_idx; i++) ++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + stat[i] += per_cpu( + pn->lruvec_stat_cpu->count[i], cpu); + + for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) +- for (i = min_idx; i < max_idx; i++) ++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + atomic_long_add(stat[i], &pi->lruvec_stat[i]); + } + } +@@ -3520,13 +3505,9 @@ static void memcg_offline_kmem(struct me + parent = root_mem_cgroup; + + /* +- * Deactivate and reparent kmem_caches. Then flush percpu +- * slab statistics to have precise values at the parent and +- * all ancestor levels. It's required to keep slab stats +- * accurate after the reparenting of kmem_caches. ++ * Deactivate and reparent kmem_caches. + */ + memcg_deactivate_kmem_caches(memcg, parent); +- memcg_flush_percpu_vmstats(memcg, true); + + kmemcg_id = memcg->kmemcg_id; + BUG_ON(kmemcg_id < 0); +@@ -5037,7 +5018,7 @@ static void mem_cgroup_free(struct mem_c + * Flush percpu vmstats and vmevents to guarantee the value correctness + * on parent's and all ancestor levels. + */ +- memcg_flush_percpu_vmstats(memcg, false); ++ memcg_flush_percpu_vmstats(memcg); + memcg_flush_percpu_vmevents(memcg); + __mem_cgroup_free(memcg); + } diff --git a/queue-5.4/mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch b/queue-5.4/mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch new file mode 100644 index 00000000000..621679f3dc9 --- /dev/null +++ b/queue-5.4/mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch @@ -0,0 +1,97 @@ +From 8068df3b60373c390198f660574ea14c8098de57 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Mon, 13 Jan 2020 16:29:07 -0800 +Subject: mm/memory_hotplug: don't free usage map when removing a re-added early section + +From: David Hildenbrand + +commit 8068df3b60373c390198f660574ea14c8098de57 upstream. + +When we remove an early section, we don't free the usage map, as the +usage maps of other sections are placed into the same page. 
Once the +section is removed, it is no longer an early section (especially, the +memmap is freed). When we re-add that section, the usage map is reused, +however, it is no longer an early section. When removing that section +again, we try to kfree() a usage map that was allocated during early +boot - bad. + +Let's check against PageReserved() to see if we are dealing with an +usage map that was allocated during boot. We could also check against +!(PageSlab(usage_page) || PageCompound(usage_page)), but PageReserved() is +cleaner. + +Can be triggered using memtrace under ppc64/powernv: + + $ mount -t debugfs none /sys/kernel/debug/ + $ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable + $ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable + ------------[ cut here ]------------ + kernel BUG at mm/slub.c:3969! + Oops: Exception in kernel mode, sig: 5 [#1] + LE PAGE_SIZE=3D64K MMU=3DHash SMP NR_CPUS=3D2048 NUMA PowerNV + Modules linked in: + CPU: 0 PID: 154 Comm: sh Not tainted 5.5.0-rc2-next-20191216-00005-g0be1dba7b7c0 #61 + NIP kfree+0x338/0x3b0 + LR section_deactivate+0x138/0x200 + Call Trace: + section_deactivate+0x138/0x200 + __remove_pages+0x114/0x150 + arch_remove_memory+0x3c/0x160 + try_remove_memory+0x114/0x1a0 + __remove_memory+0x20/0x40 + memtrace_enable_set+0x254/0x850 + simple_attr_write+0x138/0x160 + full_proxy_write+0x8c/0x110 + __vfs_write+0x38/0x70 + vfs_write+0x11c/0x2a0 + ksys_write+0x84/0x140 + system_call+0x5c/0x68 + ---[ end trace 4b053cbd84e0db62 ]--- + +The first invocation will offline+remove memory blocks. The second +invocation will first add+online them again, in order to offline+remove +them again (usually we are lucky and the exact same memory blocks will +get "reallocated"). + +Tested on powernv with boot memory: The usage map will not get freed. +Tested on x86-64 with DIMMs: The usage map will get freed. + +Using Dynamic Memory under a Power DLAPR can trigger it easily. + +Triggering removal (I assume after previously removed+re-added) of +memory from the HMC GUI can crash the kernel with the same call trace +and is fixed by this patch. + +Link: http://lkml.kernel.org/r/20191217104637.5509-1-david@redhat.com +Fixes: 326e1b8f83a4 ("mm/sparsemem: introduce a SECTION_IS_EARLY flag") +Signed-off-by: David Hildenbrand +Tested-by: Pingfan Liu +Cc: Dan Williams +Cc: Oscar Salvador +Cc: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/sparse.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/mm/sparse.c ++++ b/mm/sparse.c +@@ -775,7 +775,14 @@ static void section_deactivate(unsigned + if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { + unsigned long section_nr = pfn_to_section_nr(pfn); + +- if (!section_is_early) { ++ /* ++ * When removing an early section, the usage map is kept (as the ++ * usage maps of other sections fall into the same page). It ++ * will be re-used when re-adding the section - which is then no ++ * longer an early section. If the usage map is PageReserved, it ++ * was allocated during boot. 
++ */ ++ if (!PageReserved(virt_to_page(ms->usage))) { + kfree(ms->usage); + ms->usage = NULL; + } diff --git a/queue-5.4/mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch b/queue-5.4/mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch new file mode 100644 index 00000000000..459b8e5019d --- /dev/null +++ b/queue-5.4/mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch @@ -0,0 +1,79 @@ +From 6d9e8c651dd979aa666bee15f086745f3ea9c4b3 Mon Sep 17 00:00:00 2001 +From: Wen Yang +Date: Mon, 13 Jan 2020 16:29:23 -0800 +Subject: mm/page-writeback.c: avoid potential division by zero in wb_min_max_ratio() + +From: Wen Yang + +commit 6d9e8c651dd979aa666bee15f086745f3ea9c4b3 upstream. + +Patch series "use div64_ul() instead of div_u64() if the divisor is +unsigned long". + +We were first inspired by commit b0ab99e7736a ("sched: Fix possible divide +by zero in avg_atom () calculation"), then refer to the recently analyzed +mm code, we found this suspicious place. + + 201 if (min) { + 202 min *= this_bw; + 203 do_div(min, tot_bw); + 204 } + +And we also disassembled and confirmed it: + + /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 201 + 0xffffffff811c37da <__wb_calc_thresh+234>: xor %r10d,%r10d + 0xffffffff811c37dd <__wb_calc_thresh+237>: test %rax,%rax + 0xffffffff811c37e0 <__wb_calc_thresh+240>: je 0xffffffff811c3800 <__wb_calc_thresh+272> + /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 202 + 0xffffffff811c37e2 <__wb_calc_thresh+242>: imul %r8,%rax + /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 203 + 0xffffffff811c37e6 <__wb_calc_thresh+246>: mov %r9d,%r10d ---> truncates it to 32 bits here + 0xffffffff811c37e9 <__wb_calc_thresh+249>: xor %edx,%edx + 0xffffffff811c37eb <__wb_calc_thresh+251>: div %r10 + 0xffffffff811c37ee <__wb_calc_thresh+254>: imul %rbx,%rax + 0xffffffff811c37f2 <__wb_calc_thresh+258>: shr $0x2,%rax + 0xffffffff811c37f6 <__wb_calc_thresh+262>: mul %rcx + 0xffffffff811c37f9 <__wb_calc_thresh+265>: shr $0x2,%rdx + 0xffffffff811c37fd <__wb_calc_thresh+269>: mov %rdx,%r10 + +This series uses div64_ul() instead of div_u64() if the divisor is +unsigned long, to avoid truncation to 32-bit on 64-bit platforms. + +This patch (of 3): + +The variables 'min' and 'max' are unsigned long and do_div truncates +them to 32 bits, which means it can test non-zero and be truncated to +zero for division. Fix this issue by using div64_ul() instead. 
+ +Link: http://lkml.kernel.org/r/20200102081442.8273-2-wenyang@linux.alibaba.com +Fixes: 693108a8a667 ("writeback: make bdi->min/max_ratio handling cgroup writeback aware") +Signed-off-by: Wen Yang +Reviewed-by: Andrew Morton +Cc: Qian Cai +Cc: Tejun Heo +Cc: Jens Axboe +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -201,11 +201,11 @@ static void wb_min_max_ratio(struct bdi_ + if (this_bw < tot_bw) { + if (min) { + min *= this_bw; +- do_div(min, tot_bw); ++ min = div64_ul(min, tot_bw); + } + if (max < 100) { + max *= this_bw; +- do_div(max, tot_bw); ++ max = div64_ul(max, tot_bw); + } + } + diff --git a/queue-5.4/mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch b/queue-5.4/mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch new file mode 100644 index 00000000000..d788d4f2d2f --- /dev/null +++ b/queue-5.4/mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch @@ -0,0 +1,74 @@ +From 991589974d9c9ecb24ee3799ec8c415c730598a2 Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Mon, 13 Jan 2020 16:29:13 -0800 +Subject: mm/shmem.c: thp, shmem: fix conflict of above-47bit hint address and PMD alignment + +From: Kirill A. Shutemov + +commit 991589974d9c9ecb24ee3799ec8c415c730598a2 upstream. + +Shmem/tmpfs tries to provide THP-friendly mappings if huge pages are +enabled. But it doesn't work well with above-47bit hint address. + +Normally, the kernel doesn't create userspace mappings above 47-bit, +even if the machine allows this (such as with 5-level paging on x86-64). +Not all user space is ready to handle wide addresses. It's known that +at least some JIT compilers use higher bits in pointers to encode their +information. + +Userspace can ask for allocation from full address space by specifying +hint address (with or without MAP_FIXED) above 47-bits. If the +application doesn't need a particular address, but wants to allocate +from whole address space it can specify -1 as a hint address. + +Unfortunately, this trick breaks THP alignment in shmem/tmp: +shmem_get_unmapped_area() would not try to allocate PMD-aligned area if +*any* hint address specified. + +This can be fixed by requesting the aligned area if the we failed to +allocated at user-specified hint address. The request with inflated +length will also take the user-specified hint address. This way we will +not lose an allocation request from the full address space. + +[kirill@shutemov.name: fold in a fixup] + Link: http://lkml.kernel.org/r/20191223231309.t6bh5hkbmokihpfu@box +Link: http://lkml.kernel.org/r/20191220142548.7118-3-kirill.shutemov@linux.intel.com +Fixes: b569bab78d8d ("x86/mm: Prepare to expose larger address space to userspace") +Signed-off-by: Kirill A. Shutemov +Cc: "Willhalm, Thomas" +Cc: Dan Williams +Cc: "Bruggeman, Otto G" +Cc: "Aneesh Kumar K . V" +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/shmem.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -2106,9 +2106,10 @@ unsigned long shmem_get_unmapped_area(st + /* + * Our priority is to support MAP_SHARED mapped hugely; + * and support MAP_PRIVATE mapped hugely too, until it is COWed. 
+- * But if caller specified an address hint, respect that as before. ++ * But if caller specified an address hint and we allocated area there ++ * successfully, respect that as before. + */ +- if (uaddr) ++ if (uaddr == addr) + return addr; + + if (shmem_huge != SHMEM_HUGE_FORCE) { +@@ -2142,7 +2143,7 @@ unsigned long shmem_get_unmapped_area(st + if (inflated_len < len) + return addr; + +- inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); ++ inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags); + if (IS_ERR_VALUE(inflated_addr)) + return addr; + if (inflated_addr & ~PAGE_MASK) diff --git a/queue-5.4/net-fix-kernel-doc-warning-in-linux-netdevice.h.patch b/queue-5.4/net-fix-kernel-doc-warning-in-linux-netdevice.h.patch new file mode 100644 index 00000000000..b292fd2b9f3 --- /dev/null +++ b/queue-5.4/net-fix-kernel-doc-warning-in-linux-netdevice.h.patch @@ -0,0 +1,34 @@ +From 1f26c0d3d24125992ab0026b0dab16c08df947c7 Mon Sep 17 00:00:00 2001 +From: Randy Dunlap +Date: Mon, 16 Dec 2019 18:52:45 -0800 +Subject: net: fix kernel-doc warning in + +From: Randy Dunlap + +commit 1f26c0d3d24125992ab0026b0dab16c08df947c7 upstream. + +Fix missing '*' kernel-doc notation that causes this warning: + +../include/linux/netdevice.h:1779: warning: bad line: spinlock + +Fixes: ab92d68fc22f ("net: core: add generic lockdep keys") +Signed-off-by: Randy Dunlap +Cc: Taehee Yoo +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/netdevice.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -1761,7 +1761,7 @@ enum netdev_priv_flags { + * for hardware timestamping + * @sfp_bus: attached &struct sfp_bus structure. + * @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock +- spinlock ++ * spinlock + * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount + * @qdisc_xmit_lock_key: lockdep class annotating + * netdev_queue->_xmit_lock spinlock diff --git a/queue-5.4/net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch b/queue-5.4/net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch new file mode 100644 index 00000000000..70c878d0450 --- /dev/null +++ b/queue-5.4/net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch @@ -0,0 +1,34 @@ +From 8605131747e7e1fd8f6c9f97a00287aae2b2c640 Mon Sep 17 00:00:00 2001 +From: Jose Abreu +Date: Wed, 18 Dec 2019 11:17:41 +0100 +Subject: net: stmmac: 16KB buffer must be 16 byte aligned + +From: Jose Abreu + +commit 8605131747e7e1fd8f6c9f97a00287aae2b2c640 upstream. + +The 16KB RX Buffer must also be 16 byte aligned. Fix it. + +Fixes: 7ac6653a085b ("stmmac: Move the STMicroelectronics driver") +Signed-off-by: Jose Abreu +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/ethernet/stmicro/stmmac/common.h | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +--- a/drivers/net/ethernet/stmicro/stmmac/common.h ++++ b/drivers/net/ethernet/stmicro/stmmac/common.h +@@ -364,9 +364,8 @@ struct dma_features { + unsigned int arpoffsel; + }; + +-/* GMAC TX FIFO is 8K, Rx FIFO is 16K */ +-#define BUF_SIZE_16KiB 16384 +-/* RX Buffer size must be < 8191 and multiple of 4/8/16 bytes */ ++/* RX Buffer size must be multiple of 4/8/16 bytes */ ++#define BUF_SIZE_16KiB 16368 + #define BUF_SIZE_8KiB 8188 + #define BUF_SIZE_4KiB 4096 + #define BUF_SIZE_2KiB 2048 diff --git a/queue-5.4/net-stmmac-enable-16kb-buffer-size.patch b/queue-5.4/net-stmmac-enable-16kb-buffer-size.patch new file mode 100644 index 00000000000..c65ab711797 --- /dev/null +++ b/queue-5.4/net-stmmac-enable-16kb-buffer-size.patch @@ -0,0 +1,34 @@ +From b2f3a481c4cd62f78391b836b64c0a6e72b503d2 Mon Sep 17 00:00:00 2001 +From: Jose Abreu +Date: Wed, 18 Dec 2019 11:17:42 +0100 +Subject: net: stmmac: Enable 16KB buffer size + +From: Jose Abreu + +commit b2f3a481c4cd62f78391b836b64c0a6e72b503d2 upstream. + +XGMAC supports maximum MTU that can go to 16KB. Lets add this check in +the calculation of RX buffer size. + +Fixes: 7ac6653a085b ("stmmac: Move the STMicroelectronics driver") +Signed-off-by: Jose Abreu +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c ++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +@@ -1108,7 +1108,9 @@ static int stmmac_set_bfsize(int mtu, in + { + int ret = bufsize; + +- if (mtu >= BUF_SIZE_4KiB) ++ if (mtu >= BUF_SIZE_8KiB) ++ ret = BUF_SIZE_16KiB; ++ else if (mtu >= BUF_SIZE_4KiB) + ret = BUF_SIZE_8KiB; + else if (mtu >= BUF_SIZE_2KiB) + ret = BUF_SIZE_4KiB; diff --git a/queue-5.4/reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch b/queue-5.4/reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch new file mode 100644 index 00000000000..1a9e9dfae2a --- /dev/null +++ b/queue-5.4/reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch @@ -0,0 +1,46 @@ +From 723c0011c7f6992f57e2c629fa9c89141acc115f Mon Sep 17 00:00:00 2001 +From: Geert Uytterhoeven +Date: Wed, 20 Nov 2019 15:26:13 +0100 +Subject: reset: Fix {of,devm}_reset_control_array_get kerneldoc return types + +From: Geert Uytterhoeven + +commit 723c0011c7f6992f57e2c629fa9c89141acc115f upstream. + +of_reset_control_array_get() and devm_reset_control_array_get() return +struct reset_control pointers, not internal struct reset_control_array +pointers, just like all other reset control API calls. + +Correct the kerneldoc to match the code. 
+ +Fixes: 17c82e206d2a3cd8 ("reset: Add APIs to manage array of resets") +Signed-off-by: Geert Uytterhoeven +Signed-off-by: Philipp Zabel +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/reset/core.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/drivers/reset/core.c ++++ b/drivers/reset/core.c +@@ -861,8 +861,7 @@ static int of_reset_control_get_count(st + * @acquired: only one reset control may be acquired for a given controller + * and ID + * +- * Returns pointer to allocated reset_control_array on success or +- * error on failure ++ * Returns pointer to allocated reset_control on success or error on failure + */ + struct reset_control * + of_reset_control_array_get(struct device_node *np, bool shared, bool optional, +@@ -915,8 +914,7 @@ EXPORT_SYMBOL_GPL(of_reset_control_array + * that just have to be asserted or deasserted, without any + * requirements on the order. + * +- * Returns pointer to allocated reset_control_array on success or +- * error on failure ++ * Returns pointer to allocated reset_control on success or error on failure + */ + struct reset_control * + devm_reset_control_array_get(struct device *dev, bool shared, bool optional) diff --git a/queue-5.4/series b/queue-5.4/series index d1378be9b41..65366b23e29 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -75,3 +75,30 @@ clk-samsung-exynos5420-keep-top-g3d-clocks-enabled.patch perf-hists-fix-variable-name-s-inconsistency-in-hists__for_each-macro.patch locking-lockdep-fix-buffer-overrun-problem-in-stack_trace.patch perf-report-fix-incorrectly-added-dimensions-as-switch-perf-data-file.patch +mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch +mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch +mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch +mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch +mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch +btrfs-fix-invalid-removal-of-root-ref.patch +btrfs-do-not-delete-mismatched-root-refs.patch +btrfs-relocation-fix-reloc_root-lifespan-and-access.patch +btrfs-fix-memory-leak-in-qgroup-accounting.patch +btrfs-check-rw_devices-not-num_devices-for-balance.patch +btrfs-always-copy-scrub-arguments-back-to-user-space.patch +mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch +mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch +mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch +arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch +arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch +arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch +arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch +arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch +arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch +net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch +net-stmmac-enable-16kb-buffer-size.patch +reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch +tipc-fix-potential-hanging-after-b-rcast-changing.patch +tipc-fix-retrans-failure-due-to-wrong-destination.patch +net-fix-kernel-doc-warning-in-linux-netdevice.h.patch +block-fix-the-type-of-sts-in-bsg_queue_rq.patch diff --git a/queue-5.4/tipc-fix-potential-hanging-after-b-rcast-changing.patch b/queue-5.4/tipc-fix-potential-hanging-after-b-rcast-changing.patch new file mode 100644 index 00000000000..4d24c7d4976 --- 
/dev/null +++ b/queue-5.4/tipc-fix-potential-hanging-after-b-rcast-changing.patch @@ -0,0 +1,105 @@ +From dca4a17d24ee9d878836ce5eb8dc25be1ffa5729 Mon Sep 17 00:00:00 2001 +From: Tuong Lien +Date: Tue, 10 Dec 2019 15:21:03 +0700 +Subject: tipc: fix potential hanging after b/rcast changing + +From: Tuong Lien + +commit dca4a17d24ee9d878836ce5eb8dc25be1ffa5729 upstream. + +In commit c55c8edafa91 ("tipc: smooth change between replicast and +broadcast"), we allow instant switching between replicast and broadcast +by sending a dummy 'SYN' packet on the last used link to synchronize +packets on the links. The 'SYN' message is an object of link congestion +also, so if that happens, a 'SOCK_WAKEUP' will be scheduled to be sent +back to the socket... +However, in that commit, we simply use the same socket 'cong_link_cnt' +counter for both the 'SYN' & normal payload message sending. Therefore, +if both the replicast & broadcast links are congested, the counter will +be not updated correctly but overwritten by the latter congestion. +Later on, when the 'SOCK_WAKEUP' messages are processed, the counter is +reduced one by one and eventually overflowed. Consequently, further +activities on the socket will only wait for the false congestion signal +to disappear but never been met. + +Because sending the 'SYN' message is vital for the mechanism, it should +be done anyway. This commit fixes the issue by marking the message with +an error code e.g. 'TIPC_ERR_NO_PORT', so its sending should not face a +link congestion, there is no need to touch the socket 'cong_link_cnt' +either. In addition, in the event of any error (e.g. -ENOBUFS), we will +purge the entire payload message queue and make a return immediately. + +Fixes: c55c8edafa91 ("tipc: smooth change between replicast and broadcast") +Acked-by: Jon Maloy +Signed-off-by: Tuong Lien +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman + +--- + net/tipc/bcast.c | 24 +++++++++++++++--------- + 1 file changed, 15 insertions(+), 9 deletions(-) + +--- a/net/tipc/bcast.c ++++ b/net/tipc/bcast.c +@@ -305,17 +305,17 @@ static int tipc_rcast_xmit(struct net *n + * @skb: socket buffer to copy + * @method: send method to be used + * @dests: destination nodes for message. +- * @cong_link_cnt: returns number of encountered congested destination links + * Returns 0 if success, otherwise errno + */ + static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, + struct tipc_mc_method *method, +- struct tipc_nlist *dests, +- u16 *cong_link_cnt) ++ struct tipc_nlist *dests) + { + struct tipc_msg *hdr, *_hdr; + struct sk_buff_head tmpq; + struct sk_buff *_skb; ++ u16 cong_link_cnt; ++ int rc = 0; + + /* Is a cluster supporting with new capabilities ? 
*/ + if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL)) +@@ -343,18 +343,19 @@ static int tipc_mcast_send_sync(struct n + _hdr = buf_msg(_skb); + msg_set_size(_hdr, MCAST_H_SIZE); + msg_set_is_rcast(_hdr, !msg_is_rcast(hdr)); ++ msg_set_errcode(_hdr, TIPC_ERR_NO_PORT); + + __skb_queue_head_init(&tmpq); + __skb_queue_tail(&tmpq, _skb); + if (method->rcast) +- tipc_bcast_xmit(net, &tmpq, cong_link_cnt); ++ rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt); + else +- tipc_rcast_xmit(net, &tmpq, dests, cong_link_cnt); ++ rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt); + + /* This queue should normally be empty by now */ + __skb_queue_purge(&tmpq); + +- return 0; ++ return rc; + } + + /* tipc_mcast_xmit - deliver message to indicated destination nodes +@@ -396,9 +397,14 @@ int tipc_mcast_xmit(struct net *net, str + msg_set_is_rcast(hdr, method->rcast); + + /* Switch method ? */ +- if (rcast != method->rcast) +- tipc_mcast_send_sync(net, skb, method, +- dests, cong_link_cnt); ++ if (rcast != method->rcast) { ++ rc = tipc_mcast_send_sync(net, skb, method, dests); ++ if (unlikely(rc)) { ++ pr_err("Unable to send SYN: method %d, rc %d\n", ++ rcast, rc); ++ goto exit; ++ } ++ } + + if (method->rcast) + rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); diff --git a/queue-5.4/tipc-fix-retrans-failure-due-to-wrong-destination.patch b/queue-5.4/tipc-fix-retrans-failure-due-to-wrong-destination.patch new file mode 100644 index 00000000000..4da666beee6 --- /dev/null +++ b/queue-5.4/tipc-fix-retrans-failure-due-to-wrong-destination.patch @@ -0,0 +1,100 @@ +From abc9b4e0549b93fdaff56e9532bc49a2d7b04955 Mon Sep 17 00:00:00 2001 +From: Tuong Lien +Date: Tue, 10 Dec 2019 15:21:04 +0700 +Subject: tipc: fix retrans failure due to wrong destination + +From: Tuong Lien + +commit abc9b4e0549b93fdaff56e9532bc49a2d7b04955 upstream. + +When a user message is sent, TIPC will check if the socket has faced a +congestion at link layer. If that happens, it will make a sleep to wait +for the congestion to disappear. This leaves a gap for other users to +take over the socket (e.g. multi threads) since the socket is released +as well. Also, in case of connectionless (e.g. SOCK_RDM), user is free +to send messages to various destinations (e.g. via 'sendto()'), then +the socket's preformatted header has to be updated correspondingly +prior to the actual payload message building. + +Unfortunately, the latter action is done before the first action which +causes a condition issue that the destination of a certain message can +be modified incorrectly in the middle, leading to wrong destination +when that message is built. Consequently, when the message is sent to +the link layer, it gets stuck there forever because the peer node will +simply reject it. After a number of retransmission attempts, the link +is eventually taken down and the retransmission failure is reported. + +This commit fixes the problem by rearranging the order of actions to +prevent the race condition from occurring, so the message building is +'atomic' and its header will not be modified by anyone. + +Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") +Acked-by: Jon Maloy +Signed-off-by: Tuong Lien +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman + +--- + net/tipc/socket.c | 32 ++++++++++++++++++-------------- + 1 file changed, 18 insertions(+), 14 deletions(-) + +--- a/net/tipc/socket.c ++++ b/net/tipc/socket.c +@@ -1306,8 +1306,8 @@ static int __tipc_sendmsg(struct socket + struct tipc_msg *hdr = &tsk->phdr; + struct tipc_name_seq *seq; + struct sk_buff_head pkts; +- u32 dport, dnode = 0; +- u32 type, inst; ++ u32 dport = 0, dnode = 0; ++ u32 type = 0, inst = 0; + int mtu, rc; + + if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) +@@ -1360,23 +1360,11 @@ static int __tipc_sendmsg(struct socket + type = dest->addr.name.name.type; + inst = dest->addr.name.name.instance; + dnode = dest->addr.name.domain; +- msg_set_type(hdr, TIPC_NAMED_MSG); +- msg_set_hdr_sz(hdr, NAMED_H_SIZE); +- msg_set_nametype(hdr, type); +- msg_set_nameinst(hdr, inst); +- msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); + dport = tipc_nametbl_translate(net, type, inst, &dnode); +- msg_set_destnode(hdr, dnode); +- msg_set_destport(hdr, dport); + if (unlikely(!dport && !dnode)) + return -EHOSTUNREACH; + } else if (dest->addrtype == TIPC_ADDR_ID) { + dnode = dest->addr.id.node; +- msg_set_type(hdr, TIPC_DIRECT_MSG); +- msg_set_lookup_scope(hdr, 0); +- msg_set_destnode(hdr, dnode); +- msg_set_destport(hdr, dest->addr.id.ref); +- msg_set_hdr_sz(hdr, BASIC_H_SIZE); + } else { + return -EINVAL; + } +@@ -1387,6 +1375,22 @@ static int __tipc_sendmsg(struct socket + if (unlikely(rc)) + return rc; + ++ if (dest->addrtype == TIPC_ADDR_NAME) { ++ msg_set_type(hdr, TIPC_NAMED_MSG); ++ msg_set_hdr_sz(hdr, NAMED_H_SIZE); ++ msg_set_nametype(hdr, type); ++ msg_set_nameinst(hdr, inst); ++ msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); ++ msg_set_destnode(hdr, dnode); ++ msg_set_destport(hdr, dport); ++ } else { /* TIPC_ADDR_ID */ ++ msg_set_type(hdr, TIPC_DIRECT_MSG); ++ msg_set_lookup_scope(hdr, 0); ++ msg_set_destnode(hdr, dnode); ++ msg_set_destport(hdr, dest->addr.id.ref); ++ msg_set_hdr_sz(hdr, BASIC_H_SIZE); ++ } ++ + __skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); -- 2.47.3