git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Sun, 19 Jan 2020 15:42:21 +0000 (16:42 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Sun, 19 Jan 2020 15:42:21 +0000 (16:42 +0100)
added patches:
arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch
arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch
arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch
arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch
arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch
arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch
block-fix-the-type-of-sts-in-bsg_queue_rq.patch
btrfs-always-copy-scrub-arguments-back-to-user-space.patch
btrfs-check-rw_devices-not-num_devices-for-balance.patch
btrfs-do-not-delete-mismatched-root-refs.patch
btrfs-fix-invalid-removal-of-root-ref.patch
btrfs-fix-memory-leak-in-qgroup-accounting.patch
btrfs-relocation-fix-reloc_root-lifespan-and-access.patch
mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch
mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch
mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch
mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch
mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch
mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch
mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch
mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch
net-fix-kernel-doc-warning-in-linux-netdevice.h.patch
net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch
net-stmmac-enable-16kb-buffer-size.patch
reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch
tipc-fix-potential-hanging-after-b-rcast-changing.patch
tipc-fix-retrans-failure-due-to-wrong-destination.patch

28 files changed:
queue-5.4/arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch [new file with mode: 0644]
queue-5.4/arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch [new file with mode: 0644]
queue-5.4/arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch [new file with mode: 0644]
queue-5.4/arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch [new file with mode: 0644]
queue-5.4/arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch [new file with mode: 0644]
queue-5.4/arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch [new file with mode: 0644]
queue-5.4/block-fix-the-type-of-sts-in-bsg_queue_rq.patch [new file with mode: 0644]
queue-5.4/btrfs-always-copy-scrub-arguments-back-to-user-space.patch [new file with mode: 0644]
queue-5.4/btrfs-check-rw_devices-not-num_devices-for-balance.patch [new file with mode: 0644]
queue-5.4/btrfs-do-not-delete-mismatched-root-refs.patch [new file with mode: 0644]
queue-5.4/btrfs-fix-invalid-removal-of-root-ref.patch [new file with mode: 0644]
queue-5.4/btrfs-fix-memory-leak-in-qgroup-accounting.patch [new file with mode: 0644]
queue-5.4/btrfs-relocation-fix-reloc_root-lifespan-and-access.patch [new file with mode: 0644]
queue-5.4/mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch [new file with mode: 0644]
queue-5.4/mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch [new file with mode: 0644]
queue-5.4/mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch [new file with mode: 0644]
queue-5.4/mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch [new file with mode: 0644]
queue-5.4/mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch [new file with mode: 0644]
queue-5.4/mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch [new file with mode: 0644]
queue-5.4/mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch [new file with mode: 0644]
queue-5.4/mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch [new file with mode: 0644]
queue-5.4/net-fix-kernel-doc-warning-in-linux-netdevice.h.patch [new file with mode: 0644]
queue-5.4/net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch [new file with mode: 0644]
queue-5.4/net-stmmac-enable-16kb-buffer-size.patch [new file with mode: 0644]
queue-5.4/reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/tipc-fix-potential-hanging-after-b-rcast-changing.patch [new file with mode: 0644]
queue-5.4/tipc-fix-retrans-failure-due-to-wrong-destination.patch [new file with mode: 0644]

diff --git a/queue-5.4/arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch b/queue-5.4/arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch
new file mode 100644 (file)
index 0000000..5cfeef7
--- /dev/null
@@ -0,0 +1,45 @@
+From 4a132f60808ae3a751e107a373f8572012352d3c Mon Sep 17 00:00:00 2001
+From: Jagan Teki <jagan@amarulasolutions.com>
+Date: Mon, 30 Dec 2019 17:30:19 +0530
+Subject: ARM: dts: imx6q-icore-mipi: Use 1.5 version of i.Core MX6DL
+
+From: Jagan Teki <jagan@amarulasolutions.com>
+
+commit 4a132f60808ae3a751e107a373f8572012352d3c upstream.
+
+The EDIMM STARTER KIT i.Core 1.5 MIPI Evaluation is based on
+the 1.5 version of the i.Core MX6 CPU module. The 1.5 version
+differs from the original one in a few details, including the
+ethernet PHY interface clock provider.
+
+With this commit, the ethernet interface works properly:
+SMSC LAN8710/LAN8720 2188000.ethernet-1:00: attached PHY driver
+
+Before, when using the 1.5 version, ethernet failed to start up
+due to the un-clocked PHY interface:
+fec 2188000.ethernet eth0: could not attach to PHY
+
+A similar fix was merged for the i.Core MX6Q, but the corresponding
+update for the DL was missed.
+
+Fixes: a8039f2dd089 ("ARM: dts: imx6dl: Add Engicam i.CoreM6 1.5 Quad/Dual MIPI starter kit support")
+Cc: Jacopo Mondi <jacopo@jmondi.org>
+Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
+Signed-off-by: Jagan Teki <jagan@amarulasolutions.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6dl-icore-mipi.dts |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm/boot/dts/imx6dl-icore-mipi.dts
++++ b/arch/arm/boot/dts/imx6dl-icore-mipi.dts
+@@ -8,7 +8,7 @@
+ /dts-v1/;
+ #include "imx6dl.dtsi"
+-#include "imx6qdl-icore.dtsi"
++#include "imx6qdl-icore-1.5.dtsi"
+ / {
+       model = "Engicam i.CoreM6 DualLite/Solo MIPI Starter Kit";
diff --git a/queue-5.4/arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch b/queue-5.4/arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch
new file mode 100644 (file)
index 0000000..e6a490a
--- /dev/null
@@ -0,0 +1,39 @@
+From 4521de30fbb3f5be0db58de93582ebce72c9d44f Mon Sep 17 00:00:00 2001
+From: Anson Huang <Anson.Huang@nxp.com>
+Date: Mon, 30 Dec 2019 09:41:07 +0800
+Subject: ARM: dts: imx6qdl-sabresd: Remove incorrect power supply assignment
+
+From: Anson Huang <Anson.Huang@nxp.com>
+
+commit 4521de30fbb3f5be0db58de93582ebce72c9d44f upstream.
+
+The vdd3p0 LDO's input should come directly from the external USB
+VBUS, NOT from the PMIC's power supply. The vdd3p0 LDO's target output
+voltage can be controlled by software, and it requires the input
+voltage to be high enough. With an incorrect power supply assigned, if
+that supply's voltage is lower than the LDO's target output voltage,
+the driver will return failure and skip the LDO voltage adjustment, so
+remove the power supply assignment for vdd3p0 to avoid this scenario.
+
+Fixes: 93385546ba36 ("ARM: dts: imx6qdl-sabresd: Assign corresponding power supply for LDOs")
+Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6qdl-sabresd.dtsi |    4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/arch/arm/boot/dts/imx6qdl-sabresd.dtsi
++++ b/arch/arm/boot/dts/imx6qdl-sabresd.dtsi
+@@ -749,10 +749,6 @@
+       vin-supply = <&vgen5_reg>;
+ };
+-&reg_vdd3p0 {
+-      vin-supply = <&sw2_reg>;
+-};
+-
+ &reg_vdd2p5 {
+       vin-supply = <&vgen5_reg>;
+ };
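
For context, the failure mode described in this changelog (and the three
identical imx6 fixes that follow) can be sketched in a few lines of C;
the function and dropout value here are hypothetical, not the actual
regulator-core code:

    /* sketch: an LDO cannot regulate to a target above what its input allows */
    static int ldo_set_output(int supply_uV, int target_uV)
    {
        const int dropout_uV = 150000;     /* hypothetical minimum dropout */

        if (supply_uV - dropout_uV < target_uV)
            return -EINVAL;                /* adjustment fails and is skipped */
        return 0;                          /* input high enough: adjust OK */
    }

With vin-supply pointing at the PMIC's sw2 regulator instead of USB VBUS,
supply_uV can end up below target_uV and a call like the one above fails;
dropping the assignment restores the intended VBUS input.
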
diff --git a/queue-5.4/arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch b/queue-5.4/arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch
new file mode 100644 (file)
index 0000000..6fcf5be
--- /dev/null
@@ -0,0 +1,39 @@
+From b4eb9ef0e29cd28c6fd684e0ab77bda824acb20e Mon Sep 17 00:00:00 2001
+From: Anson Huang <Anson.Huang@nxp.com>
+Date: Mon, 30 Dec 2019 09:41:09 +0800
+Subject: ARM: dts: imx6sl-evk: Remove incorrect power supply assignment
+
+From: Anson Huang <Anson.Huang@nxp.com>
+
+commit b4eb9ef0e29cd28c6fd684e0ab77bda824acb20e upstream.
+
+The vdd3p0 LDO's input should come directly from the external USB
+VBUS, NOT from the PMIC's power supply. The vdd3p0 LDO's target output
+voltage can be controlled by software, and it requires the input
+voltage to be high enough. With an incorrect power supply assigned, if
+that supply's voltage is lower than the LDO's target output voltage,
+the driver will return failure and skip the LDO voltage adjustment, so
+remove the power supply assignment for vdd3p0 to avoid this scenario.
+
+Fixes: 3feea8805d6f ("ARM: dts: imx6sl-evk: Assign corresponding power supply for LDOs")
+Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6sl-evk.dts |    4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/arch/arm/boot/dts/imx6sl-evk.dts
++++ b/arch/arm/boot/dts/imx6sl-evk.dts
+@@ -584,10 +584,6 @@
+       vin-supply = <&sw2_reg>;
+ };
+-&reg_vdd3p0 {
+-      vin-supply = <&sw2_reg>;
+-};
+-
+ &reg_vdd2p5 {
+       vin-supply = <&sw2_reg>;
+ };
diff --git a/queue-5.4/arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch b/queue-5.4/arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch
new file mode 100644 (file)
index 0000000..dbc84c7
--- /dev/null
@@ -0,0 +1,39 @@
+From 3479b2843c78ffb60247f522226ba68f93aee355 Mon Sep 17 00:00:00 2001
+From: Anson Huang <Anson.Huang@nxp.com>
+Date: Mon, 30 Dec 2019 09:41:10 +0800
+Subject: ARM: dts: imx6sll-evk: Remove incorrect power supply assignment
+
+From: Anson Huang <Anson.Huang@nxp.com>
+
+commit 3479b2843c78ffb60247f522226ba68f93aee355 upstream.
+
+The vdd3p0 LDO's input should come directly from the external USB
+VBUS, NOT from the PMIC's power supply. The vdd3p0 LDO's target output
+voltage can be controlled by software, and it requires the input
+voltage to be high enough. With an incorrect power supply assigned, if
+that supply's voltage is lower than the LDO's target output voltage,
+the driver will return failure and skip the LDO voltage adjustment, so
+remove the power supply assignment for vdd3p0 to avoid this scenario.
+
+Fixes: 96a9169cf621 ("ARM: dts: imx6sll-evk: Assign corresponding power supply for vdd3p0")
+Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6sll-evk.dts |    4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/arch/arm/boot/dts/imx6sll-evk.dts
++++ b/arch/arm/boot/dts/imx6sll-evk.dts
+@@ -265,10 +265,6 @@
+       status = "okay";
+ };
+-&reg_3p0 {
+-      vin-supply = <&sw2_reg>;
+-};
+-
+ &snvs_poweroff {
+       status = "okay";
+ };
diff --git a/queue-5.4/arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch b/queue-5.4/arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch
new file mode 100644 (file)
index 0000000..94fab0e
--- /dev/null
@@ -0,0 +1,53 @@
+From d4918ebb5c256d26696a13e78ac68c146111191a Mon Sep 17 00:00:00 2001
+From: Anson Huang <Anson.Huang@nxp.com>
+Date: Mon, 30 Dec 2019 09:41:08 +0800
+Subject: ARM: dts: imx6sx-sdb: Remove incorrect power supply assignment
+
+From: Anson Huang <Anson.Huang@nxp.com>
+
+commit d4918ebb5c256d26696a13e78ac68c146111191a upstream.
+
+The vdd3p0 LDO's input should come directly from the external USB
+VBUS, NOT from the PMIC's power supply. The vdd3p0 LDO's target output
+voltage can be controlled by software, and it requires the input
+voltage to be high enough. With an incorrect power supply assigned, if
+that supply's voltage is lower than the LDO's target output voltage,
+the driver will return failure and skip the LDO voltage adjustment, so
+remove the power supply assignment for vdd3p0 to avoid this scenario.
+
+Fixes: 37a4bdead109 ("ARM: dts: imx6sx-sdb: Assign corresponding power supply for LDOs")
+Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6sx-sdb-reva.dts |    4 ----
+ arch/arm/boot/dts/imx6sx-sdb.dts      |    4 ----
+ 2 files changed, 8 deletions(-)
+
+--- a/arch/arm/boot/dts/imx6sx-sdb-reva.dts
++++ b/arch/arm/boot/dts/imx6sx-sdb-reva.dts
+@@ -159,10 +159,6 @@
+       vin-supply = <&vgen6_reg>;
+ };
+-&reg_vdd3p0 {
+-      vin-supply = <&sw2_reg>;
+-};
+-
+ &reg_vdd2p5 {
+       vin-supply = <&vgen6_reg>;
+ };
+--- a/arch/arm/boot/dts/imx6sx-sdb.dts
++++ b/arch/arm/boot/dts/imx6sx-sdb.dts
+@@ -141,10 +141,6 @@
+       vin-supply = <&vgen6_reg>;
+ };
+-&reg_vdd3p0 {
+-      vin-supply = <&sw2_reg>;
+-};
+-
+ &reg_vdd2p5 {
+       vin-supply = <&vgen6_reg>;
+ };
diff --git a/queue-5.4/arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch b/queue-5.4/arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch
new file mode 100644 (file)
index 0000000..9c33926
--- /dev/null
@@ -0,0 +1,31 @@
+From 4b0b97e651ecf29f20248420b52b6864fbd40bc2 Mon Sep 17 00:00:00 2001
+From: Marcel Ziswiler <marcel.ziswiler@toradex.com>
+Date: Wed, 8 Jan 2020 17:12:31 +0100
+Subject: ARM: dts: imx7: Fix Toradex Colibri iMX7S 256MB NAND flash support
+
+From: Marcel Ziswiler <marcel.ziswiler@toradex.com>
+
+commit 4b0b97e651ecf29f20248420b52b6864fbd40bc2 upstream.
+
+It turns out that when the eMMC version was introduced, the gpmi node
+required for NAND flash support ended up enabled exclusively on the
+Colibri iMX7D 512MB, leaving it disabled on the iMX7S 256MB NAND module.
+
+Fixes: f928a4a377e4 ("ARM: dts: imx7: add Toradex Colibri iMX7D 1GB (eMMC) support")
+Signed-off-by: Marcel Ziswiler <marcel.ziswiler@toradex.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx7s-colibri.dtsi |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/arm/boot/dts/imx7s-colibri.dtsi
++++ b/arch/arm/boot/dts/imx7s-colibri.dtsi
+@@ -49,3 +49,7 @@
+               reg = <0x80000000 0x10000000>;
+       };
+ };
++
++&gpmi {
++      status = "okay";
++};
diff --git a/queue-5.4/block-fix-the-type-of-sts-in-bsg_queue_rq.patch b/queue-5.4/block-fix-the-type-of-sts-in-bsg_queue_rq.patch
new file mode 100644 (file)
index 0000000..cff1667
--- /dev/null
@@ -0,0 +1,39 @@
+From c44a4edb20938c85b64a256661443039f5bffdea Mon Sep 17 00:00:00 2001
+From: Bart Van Assche <bvanassche@acm.org>
+Date: Tue, 17 Dec 2019 16:23:29 -0800
+Subject: block: Fix the type of 'sts' in bsg_queue_rq()
+
+From: Bart Van Assche <bvanassche@acm.org>
+
+commit c44a4edb20938c85b64a256661443039f5bffdea upstream.
+
+This patch fixes the following sparse warnings:
+
+block/bsg-lib.c:269:19: warning: incorrect type in initializer (different base types)
+block/bsg-lib.c:269:19:    expected int sts
+block/bsg-lib.c:269:19:    got restricted blk_status_t [usertype]
+block/bsg-lib.c:286:16: warning: incorrect type in return expression (different base types)
+block/bsg-lib.c:286:16:    expected restricted blk_status_t
+block/bsg-lib.c:286:16:    got int [assigned] sts
+
+Cc: Martin Wilck <mwilck@suse.com>
+Fixes: d46fe2cb2dce ("block: drop device references in bsg_queue_rq()")
+Signed-off-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/bsg-lib.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/block/bsg-lib.c
++++ b/block/bsg-lib.c
+@@ -266,7 +266,7 @@ static blk_status_t bsg_queue_rq(struct
+       struct request *req = bd->rq;
+       struct bsg_set *bset =
+               container_of(q->tag_set, struct bsg_set, tag_set);
+-      int sts = BLK_STS_IOERR;
++      blk_status_t sts = BLK_STS_IOERR;
+       int ret;
+       blk_mq_start_request(req);
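
For background, a minimal sketch of the type contract involved; the
callback body is hypothetical, but the signature matches the blk-mq
queue_rq interface that bsg_queue_rq() implements:

    #include <linux/blk-mq.h>       /* blk_status_t, BLK_STS_* */

    /* sketch: queue_rq callbacks return blk_status_t, not int */
    static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
                                         const struct blk_mq_queue_data *bd)
    {
        blk_status_t sts = BLK_STS_IOERR;  /* same type as the return value */
        int ret = 0;                       /* driver-internal error code */

        if (ret == 0)
            sts = BLK_STS_OK;
        return sts;  /* no int <-> blk_status_t mixing, so sparse stays quiet */
    }
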
diff --git a/queue-5.4/btrfs-always-copy-scrub-arguments-back-to-user-space.patch b/queue-5.4/btrfs-always-copy-scrub-arguments-back-to-user-space.patch
new file mode 100644 (file)
index 0000000..64612c6
--- /dev/null
@@ -0,0 +1,64 @@
+From 5afe6ce748c1ea99e0d648153c05075e1ab93afb Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 16 Jan 2020 11:29:20 +0000
+Subject: Btrfs: always copy scrub arguments back to user space
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 5afe6ce748c1ea99e0d648153c05075e1ab93afb upstream.
+
+If scrub returns an error we are not copying back the scrub arguments
+structure to user space. This prevents user space from knowing how much
+progress scrub has made if an error happened - this includes -ECANCELED,
+which is returned when users ask for scrub to stop. A particular use
+case, relied on by btrfs-progs, is resuming scrub after it is canceled;
+in that case it checks the progress from the scrub arguments structure
+and then uses that progress in a call to resume scrub.
+
+So fix this by always copying the scrub arguments structure back to
+user space, overwriting the returned value with -EFAULT only if copying
+the structure failed. That way user space knows that either the copy
+did not happen at all, and therefore the structure is stale, or it
+happened only partially and the structure is likely invalid due to the
+partial copy.
+
+Reported-by: Graham Cobb <g.btrfs@cobb.uk.net>
+Link: https://lore.kernel.org/linux-btrfs/d0a97688-78be-08de-ca7d-bcb4c7fb397e@cobb.uk.net/
+Fixes: 06fe39ab15a6a4 ("Btrfs: do not overwrite scrub error with fault error in scrub ioctl")
+CC: stable@vger.kernel.org # 5.1+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Tested-by: Graham Cobb <g.btrfs@cobb.uk.net>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c |   14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -4254,7 +4254,19 @@ static long btrfs_ioctl_scrub(struct fil
+                             &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+                             0);
+-      if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
++      /*
++       * Copy scrub args to user space even if btrfs_scrub_dev() returned an
++       * error. This is important as it allows user space to know how much
++       * progress scrub has done. For example, if scrub is canceled we get
++       * -ECANCELED from btrfs_scrub_dev() and return that error back to user
++       * space. Later user space can inspect the progress from the structure
++       * btrfs_ioctl_scrub_args and resume scrub from where it left off
++       * previously (btrfs-progs does this).
++       * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
++       * then return -EFAULT to signal the structure was not copied or it may
++       * be corrupt and unreliable due to a partial copy.
++       */
++      if (copy_to_user(arg, sa, sizeof(*sa)))
+               ret = -EFAULT;
+       if (!(sa->flags & BTRFS_SCRUB_READONLY))
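
To see why the copy-back matters, here is a hypothetical user-space
sequence in the spirit of btrfs-progs (a sketch under the assumption
that resume reuses the returned progress; not actual btrfs-progs code):

    #include <errno.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>   /* BTRFS_IOC_SCRUB, struct btrfs_ioctl_scrub_args */

    static int scrub_with_resume(int fd)
    {
        struct btrfs_ioctl_scrub_args sa;

        memset(&sa, 0, sizeof(sa));
        if (ioctl(fd, BTRFS_IOC_SCRUB, &sa) < 0 && errno == ECANCELED) {
            /* With the fix, sa.progress was copied back even on error,
             * so a second call can pick up where scrub left off. */
            return ioctl(fd, BTRFS_IOC_SCRUB, &sa);
        }
        return 0;
    }
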
diff --git a/queue-5.4/btrfs-check-rw_devices-not-num_devices-for-balance.patch b/queue-5.4/btrfs-check-rw_devices-not-num_devices-for-balance.patch
new file mode 100644 (file)
index 0000000..e19180b
--- /dev/null
@@ -0,0 +1,92 @@
+From b35cf1f0bf1f2b0b193093338414b9bd63b29015 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Fri, 10 Jan 2020 11:11:24 -0500
+Subject: btrfs: check rw_devices, not num_devices for balance
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit b35cf1f0bf1f2b0b193093338414b9bd63b29015 upstream.
+
+The fstest btrfs/154 reports
+
+  [ 8675.381709] BTRFS: Transaction aborted (error -28)
+  [ 8675.383302] WARNING: CPU: 1 PID: 31900 at fs/btrfs/block-group.c:2038 btrfs_create_pending_block_groups+0x1e0/0x1f0 [btrfs]
+  [ 8675.390925] CPU: 1 PID: 31900 Comm: btrfs Not tainted 5.5.0-rc6-default+ #935
+  [ 8675.392780] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
+  [ 8675.395452] RIP: 0010:btrfs_create_pending_block_groups+0x1e0/0x1f0 [btrfs]
+  [ 8675.402672] RSP: 0018:ffffb2090888fb00 EFLAGS: 00010286
+  [ 8675.404413] RAX: 0000000000000000 RBX: ffff92026dfa91c8 RCX: 0000000000000001
+  [ 8675.406609] RDX: 0000000000000000 RSI: ffffffff8e100899 RDI: ffffffff8e100971
+  [ 8675.408775] RBP: ffff920247c61660 R08: 0000000000000000 R09: 0000000000000000
+  [ 8675.410978] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000ffffffe4
+  [ 8675.412647] R13: ffff92026db74000 R14: ffff920247c616b8 R15: ffff92026dfbc000
+  [ 8675.413994] FS:  00007fd5e57248c0(0000) GS:ffff92027d800000(0000) knlGS:0000000000000000
+  [ 8675.416146] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  [ 8675.417833] CR2: 0000564aa51682d8 CR3: 000000006dcbc004 CR4: 0000000000160ee0
+  [ 8675.419801] Call Trace:
+  [ 8675.420742]  btrfs_start_dirty_block_groups+0x355/0x480 [btrfs]
+  [ 8675.422600]  btrfs_commit_transaction+0xc8/0xaf0 [btrfs]
+  [ 8675.424335]  reset_balance_state+0x14a/0x190 [btrfs]
+  [ 8675.425824]  btrfs_balance.cold+0xe7/0x154 [btrfs]
+  [ 8675.427313]  ? kmem_cache_alloc_trace+0x235/0x2c0
+  [ 8675.428663]  btrfs_ioctl_balance+0x298/0x350 [btrfs]
+  [ 8675.430285]  btrfs_ioctl+0x466/0x2550 [btrfs]
+  [ 8675.431788]  ? mem_cgroup_charge_statistics+0x51/0xf0
+  [ 8675.433487]  ? mem_cgroup_commit_charge+0x56/0x400
+  [ 8675.435122]  ? do_raw_spin_unlock+0x4b/0xc0
+  [ 8675.436618]  ? _raw_spin_unlock+0x1f/0x30
+  [ 8675.438093]  ? __handle_mm_fault+0x499/0x740
+  [ 8675.439619]  ? do_vfs_ioctl+0x56e/0x770
+  [ 8675.441034]  do_vfs_ioctl+0x56e/0x770
+  [ 8675.442411]  ksys_ioctl+0x3a/0x70
+  [ 8675.443718]  ? trace_hardirqs_off_thunk+0x1a/0x1c
+  [ 8675.445333]  __x64_sys_ioctl+0x16/0x20
+  [ 8675.446705]  do_syscall_64+0x50/0x210
+  [ 8675.448059]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
+  [ 8675.479187] BTRFS: error (device vdb) in btrfs_create_pending_block_groups:2038: errno=-28 No space left
+
+We now use btrfs_can_overcommit() to see if we can flip a block group
+read only.  Before, this would fail because we weren't taking into
+account the usable unallocated space for allocating chunks.  With my
+patches we were allowed to do the balance, which is technically correct.
+
+The test is trying to start balance on a degraded mount.  So now we're
+trying to allocate a chunk and cannot, because we want to allocate a
+RAID1 chunk but there's only one device available for use.  This
+results in an ENOSPC.
+
+But we shouldn't even be making it this far: we don't have enough
+devices to restripe.  The problem is that we're using
+btrfs_num_devices(), which also includes missing devices. That's not
+what we want here; we need to use rw_devices.
+
+The chunk_mutex is not needed here; rw_devices changes only on device
+add, remove, or replace, all of which are excluded by the EXCL_OP
+mechanism.
+
+Fixes: e4d8ec0f65b9 ("Btrfs: implement online profile changing")
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ add stacktrace, update changelog, drop chunk_mutex ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/volumes.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -4066,7 +4066,11 @@ int btrfs_balance(struct btrfs_fs_info *
+               }
+       }
+-      num_devices = btrfs_num_devices(fs_info);
++      /*
++       * rw_devices will not change at the moment, device add/delete/replace
++       * are excluded by EXCL_OP
++       */
++      num_devices = fs_info->fs_devices->rw_devices;
+       /*
+        * SINGLE profile on-disk has no profile bit, but in-memory we have a
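
The distinction the patch relies on reduces to this sketch (field names
follow fs/btrfs/volumes.h, but the struct and helper here are simplified
and illustrative):

    /* sketch: the two device counters the changelog contrasts */
    struct fs_devices_sketch {
        unsigned long long num_devices;  /* all devices, including missing ones */
        unsigned long long rw_devices;   /* devices currently open read-write */
    };

    /* restriping can only place chunks on writable devices */
    static unsigned long long devices_for_balance(const struct fs_devices_sketch *fsd)
    {
        return fsd->rw_devices;          /* not num_devices */
    }
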
diff --git a/queue-5.4/btrfs-do-not-delete-mismatched-root-refs.patch b/queue-5.4/btrfs-do-not-delete-mismatched-root-refs.patch
new file mode 100644 (file)
index 0000000..add0eee
--- /dev/null
@@ -0,0 +1,45 @@
+From 423a716cd7be16fb08690760691befe3be97d3fc Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 18 Dec 2019 17:20:29 -0500
+Subject: btrfs: do not delete mismatched root refs
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 423a716cd7be16fb08690760691befe3be97d3fc upstream.
+
+btrfs_del_root_ref() will simply WARN_ON() if the ref doesn't match in
+any way, and then continue to delete the reference anyway.  This
+shouldn't happen: we carry these values because there's more to the
+reference than the original root and the sub root.  If any of these
+checks fail, return -ENOENT instead.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/root-tree.c |   10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/root-tree.c
++++ b/fs/btrfs/root-tree.c
+@@ -376,11 +376,13 @@ again:
+               leaf = path->nodes[0];
+               ref = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_root_ref);
+-
+-              WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+-              WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+               ptr = (unsigned long)(ref + 1);
+-              WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
++              if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
++                  (btrfs_root_ref_name_len(leaf, ref) != name_len) ||
++                  memcmp_extent_buffer(leaf, name, ptr, name_len)) {
++                      err = -ENOENT;
++                      goto out;
++              }
+               *sequence = btrfs_root_ref_sequence(leaf, ref);
+               ret = btrfs_del_item(trans, tree_root, path);
diff --git a/queue-5.4/btrfs-fix-invalid-removal-of-root-ref.patch b/queue-5.4/btrfs-fix-invalid-removal-of-root-ref.patch
new file mode 100644 (file)
index 0000000..9bd2b2c
--- /dev/null
@@ -0,0 +1,89 @@
+From d49d3287e74ffe55ae7430d1e795e5f9bf7359ea Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 18 Dec 2019 17:20:28 -0500
+Subject: btrfs: fix invalid removal of root ref
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit d49d3287e74ffe55ae7430d1e795e5f9bf7359ea upstream.
+
+If we have the following sequence of events
+
+  btrfs sub create A
+  btrfs sub create A/B
+  btrfs sub snap A C
+  mkdir C/foo
+  mv A/B C/foo
+  rm -rf *
+
+We will end up with a transaction abort.
+
+The reason for this is because we create a root ref for B pointing to A.
+When we create a snapshot of C we still have B in our tree, but because
+the root ref points to A and not C we will make it appear to be empty.
+
+The problem happens when we move B into C.  This removes the root ref
+for B pointing to A and adds a ref of B pointing to C.  When we rmdir C
+we'll see that we have a ref to our root and remove the root ref,
+despite not actually matching our reference name.
+
+Now btrfs_del_root_ref() allowing this to work is a bug as well, but we
+know that this inode does not actually point to a root ref in the first
+place, so we shouldn't be calling btrfs_del_root_ref() at all; instead,
+simply look up the dir index for this item and do the rest of the
+removal.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c |   27 +++++++++++++++++++--------
+ 1 file changed, 19 insertions(+), 8 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4248,13 +4248,16 @@ static int btrfs_unlink_subvol(struct bt
+       }
+       btrfs_release_path(path);
+-      ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid,
+-                               dir_ino, &index, name, name_len);
+-      if (ret < 0) {
+-              if (ret != -ENOENT) {
+-                      btrfs_abort_transaction(trans, ret);
+-                      goto out;
+-              }
++      /*
++       * This is a placeholder inode for a subvolume we didn't have a
++       * reference to at the time of the snapshot creation.  In the meantime
++       * we could have renamed the real subvol link into our snapshot, so
++       * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
++       * Instead simply lookup the dir_index_item for this entry so we can
++       * remove it.  Otherwise we know we have a ref to the root and we can
++       * call btrfs_del_root_ref, and it _shouldn't_ fail.
++       */
++      if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
+               di = btrfs_search_dir_index_item(root, path, dir_ino,
+                                                name, name_len);
+               if (IS_ERR_OR_NULL(di)) {
+@@ -4269,8 +4272,16 @@ static int btrfs_unlink_subvol(struct bt
+               leaf = path->nodes[0];
+               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+               index = key.offset;
++              btrfs_release_path(path);
++      } else {
++              ret = btrfs_del_root_ref(trans, objectid,
++                                       root->root_key.objectid, dir_ino,
++                                       &index, name, name_len);
++              if (ret) {
++                      btrfs_abort_transaction(trans, ret);
++                      goto out;
++              }
+       }
+-      btrfs_release_path(path);
+       ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
+       if (ret) {
diff --git a/queue-5.4/btrfs-fix-memory-leak-in-qgroup-accounting.patch b/queue-5.4/btrfs-fix-memory-leak-in-qgroup-accounting.patch
new file mode 100644 (file)
index 0000000..e16f24c
--- /dev/null
@@ -0,0 +1,80 @@
+From 26ef8493e1ab771cb01d27defca2fa1315dc3980 Mon Sep 17 00:00:00 2001
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Date: Wed, 8 Jan 2020 21:07:32 +0900
+Subject: btrfs: fix memory leak in qgroup accounting
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+commit 26ef8493e1ab771cb01d27defca2fa1315dc3980 upstream.
+
+When running xfstests on the current btrfs I get the following splat from
+kmemleak:
+
+unreferenced object 0xffff88821b2404e0 (size 32):
+  comm "kworker/u4:7", pid 26663, jiffies 4295283698 (age 8.776s)
+  hex dump (first 32 bytes):
+    01 00 00 00 00 00 00 00 10 ff fd 26 82 88 ff ff  ...........&....
+    10 ff fd 26 82 88 ff ff 20 ff fd 26 82 88 ff ff  ...&.... ..&....
+  backtrace:
+    [<00000000f94fd43f>] ulist_alloc+0x25/0x60 [btrfs]
+    [<00000000fd023d99>] btrfs_find_all_roots_safe+0x41/0x100 [btrfs]
+    [<000000008f17bd32>] btrfs_find_all_roots+0x52/0x70 [btrfs]
+    [<00000000b7660afb>] btrfs_qgroup_rescan_worker+0x343/0x680 [btrfs]
+    [<0000000058e66778>] btrfs_work_helper+0xac/0x1e0 [btrfs]
+    [<00000000f0188930>] process_one_work+0x1cf/0x350
+    [<00000000af5f2f8e>] worker_thread+0x28/0x3c0
+    [<00000000b55a1add>] kthread+0x109/0x120
+    [<00000000f88cbd17>] ret_from_fork+0x35/0x40
+
+This corresponds to:
+
+  (gdb) l *(btrfs_find_all_roots_safe+0x41)
+  0x8d7e1 is in btrfs_find_all_roots_safe (fs/btrfs/backref.c:1413).
+  1408
+  1409            tmp = ulist_alloc(GFP_NOFS);
+  1410            if (!tmp)
+  1411                    return -ENOMEM;
+  1412            *roots = ulist_alloc(GFP_NOFS);
+  1413            if (!*roots) {
+  1414                    ulist_free(tmp);
+  1415                    return -ENOMEM;
+  1416            }
+  1417
+
+Following the lifetime of the allocated 'roots' ulist, it gets freed
+again in btrfs_qgroup_account_extent().
+
+But this does not happen if the function is called with the
+'BTRFS_FS_QUOTA_ENABLED' flag cleared; in that case
+btrfs_qgroup_account_extent() bails out early and returns directly.
+
+Instead of returning directly we should jump to the 'out_free' label in
+order to free all resources as expected.
+
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+[ add comment ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/qgroup.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -2423,8 +2423,12 @@ int btrfs_qgroup_account_extent(struct b
+       u64 nr_old_roots = 0;
+       int ret = 0;
++      /*
++       * If quotas get disabled meanwhile, the resources need to be freed and
++       * we can't just exit here.
++       */
+       if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+-              return 0;
++              goto out_free;
+       if (new_roots) {
+               if (!maybe_fs_roots(new_roots))
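
Condensed, the control flow after the fix looks like this (names
simplified; a sketch, not the actual fs/btrfs/qgroup.c code):

    /* sketch: both ulists must be freed even when quotas are disabled */
    static int account_extent_sketch(struct ulist *new_roots,
                                     struct ulist *old_roots, bool quota_enabled)
    {
        int ret = 0;

        if (!quota_enabled)
            goto out_free;   /* previously: return 0, leaking both ulists */

        /* ... qgroup accounting work would happen here ... */

    out_free:
        ulist_free(new_roots);   /* ulist_free() tolerates NULL */
        ulist_free(old_roots);
        return ret;
    }
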
diff --git a/queue-5.4/btrfs-relocation-fix-reloc_root-lifespan-and-access.patch b/queue-5.4/btrfs-relocation-fix-reloc_root-lifespan-and-access.patch
new file mode 100644 (file)
index 0000000..7892cc3
--- /dev/null
@@ -0,0 +1,234 @@
+From 6282675e6708ec78518cc0e9ad1f1f73d7c5c53d Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 8 Jan 2020 13:12:00 +0800
+Subject: btrfs: relocation: fix reloc_root lifespan and access
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 6282675e6708ec78518cc0e9ad1f1f73d7c5c53d upstream.
+
+[BUG]
+There are several different KASAN reports for balance + snapshot
+workloads.  Involved call paths include:
+
+   should_ignore_root+0x54/0xb0 [btrfs]
+   build_backref_tree+0x11af/0x2280 [btrfs]
+   relocate_tree_blocks+0x391/0xb80 [btrfs]
+   relocate_block_group+0x3e5/0xa00 [btrfs]
+   btrfs_relocate_block_group+0x240/0x4d0 [btrfs]
+   btrfs_relocate_chunk+0x53/0xf0 [btrfs]
+   btrfs_balance+0xc91/0x1840 [btrfs]
+   btrfs_ioctl_balance+0x416/0x4e0 [btrfs]
+   btrfs_ioctl+0x8af/0x3e60 [btrfs]
+   do_vfs_ioctl+0x831/0xb10
+
+   create_reloc_root+0x9f/0x460 [btrfs]
+   btrfs_reloc_post_snapshot+0xff/0x6c0 [btrfs]
+   create_pending_snapshot+0xa9b/0x15f0 [btrfs]
+   create_pending_snapshots+0x111/0x140 [btrfs]
+   btrfs_commit_transaction+0x7a6/0x1360 [btrfs]
+   btrfs_mksubvol+0x915/0x960 [btrfs]
+   btrfs_ioctl_snap_create_transid+0x1d5/0x1e0 [btrfs]
+   btrfs_ioctl_snap_create_v2+0x1d3/0x270 [btrfs]
+   btrfs_ioctl+0x241b/0x3e60 [btrfs]
+   do_vfs_ioctl+0x831/0xb10
+
+   btrfs_reloc_pre_snapshot+0x85/0xc0 [btrfs]
+   create_pending_snapshot+0x209/0x15f0 [btrfs]
+   create_pending_snapshots+0x111/0x140 [btrfs]
+   btrfs_commit_transaction+0x7a6/0x1360 [btrfs]
+   btrfs_mksubvol+0x915/0x960 [btrfs]
+   btrfs_ioctl_snap_create_transid+0x1d5/0x1e0 [btrfs]
+   btrfs_ioctl_snap_create_v2+0x1d3/0x270 [btrfs]
+   btrfs_ioctl+0x241b/0x3e60 [btrfs]
+   do_vfs_ioctl+0x831/0xb10
+
+[CAUSE]
+All these call sites are only relying on root->reloc_root, which can
+undergo btrfs_drop_snapshot(), and since we don't have real refcount
+based protection to reloc roots, we can reach already dropped reloc
+root, triggering KASAN.
+
+[FIX]
+To avoid such access to unstable root->reloc_root, we should check
+BTRFS_ROOT_DEAD_RELOC_TREE bit first.
+
+This patch introduces wrappers that provide the correct way to check the
+bit with memory barriers protection.
+
+Most callers don't distinguish between a merged reloc tree and no reloc
+tree.  The only exception is should_ignore_root(), as a merged reloc
+tree can be ignored, while no reloc tree shouldn't be.
+
+[CRITICAL SECTION ANALYSIS]
+Although test_bit()/set_bit()/clear_bit() don't imply a barrier, the
+DEAD_RELOC_TREE bit has extra help from the transaction as a higher
+level barrier.  The lifespans of root::reloc_root and the
+DEAD_RELOC_TREE bit are:
+
+       NULL: reloc_root is NULL        PTR: reloc_root is not NULL
+       0: DEAD_RELOC_ROOT bit not set  DEAD: DEAD_RELOC_ROOT bit set
+
+       (NULL, 0)    Initial state               __
+         |                                      /\ Section A
+        btrfs_init_reloc_root()                         \/
+         |                                      __
+       (PTR, 0)     reloc_root initialized      /\
+          |                                     |
+       btrfs_update_reloc_root()                |  Section B
+          |                                     |
+       (PTR, DEAD)  reloc_root has been merged  \/
+          |                                     __
+       === btrfs_commit_transaction() ====================
+         |                                      /\
+       clean_dirty_subvols()                    |
+         |                                      |  Section C
+       (NULL, DEAD) reloc_root cleanup starts   \/
+          |                                     __
+       btrfs_drop_snapshot()                    /\
+         |                                      |  Section D
+       (NULL, 0)    Back to initial state       \/
+
+Every have_reloc_root() or test_bit(DEAD_RELOC_ROOT) caller holds a
+transaction handle, so no such caller can cross a transaction boundary.
+
+In Section A, every caller simply finds no DEAD bit and grabs reloc_root.
+
+In the cross section A-B, a caller may see no DEAD bit, but since
+reloc_root is still completely valid, accessing it is safe.
+
+No test_bit() caller can cross the boundary of Section B and Section C.
+
+In Section C, every caller found the DEAD bit, so no one will access
+reloc_root.
+
+In the cross section C-D, either a caller sees the DEAD bit set and
+avoids accessing reloc_root regardless of whether that would be safe,
+or it sees the DEAD bit cleared and then accesses reloc_root, which is
+already NULL, so nothing can go wrong.
+
+The memory write barriers are between the reloc_root updates and bit
+set/clear, the pairing read side is before test_bit.
+
+Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
+Fixes: d2311e698578 ("btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ barriers ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/relocation.c |   51 +++++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 46 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -517,6 +517,34 @@ static int update_backref_cache(struct b
+       return 1;
+ }
++static bool reloc_root_is_dead(struct btrfs_root *root)
++{
++      /*
++       * Pair with set_bit/clear_bit in clean_dirty_subvols and
++       * btrfs_update_reloc_root. We need to see the updated bit before
++       * trying to access reloc_root
++       */
++      smp_rmb();
++      if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
++              return true;
++      return false;
++}
++
++/*
++ * Check if this subvolume tree has valid reloc tree.
++ *
++ * Reloc tree after swap is considered dead, thus not considered as valid.
++ * This is enough for most callers, as they don't distinguish dead reloc root
++ * from no reloc root.  But should_ignore_root() below is a special case.
++ */
++static bool have_reloc_root(struct btrfs_root *root)
++{
++      if (reloc_root_is_dead(root))
++              return false;
++      if (!root->reloc_root)
++              return false;
++      return true;
++}
+ static int should_ignore_root(struct btrfs_root *root)
+ {
+@@ -525,6 +553,10 @@ static int should_ignore_root(struct btr
+       if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+               return 0;
++      /* This root has been merged with its reloc tree, we can ignore it */
++      if (reloc_root_is_dead(root))
++              return 1;
++
+       reloc_root = root->reloc_root;
+       if (!reloc_root)
+               return 0;
+@@ -1439,7 +1471,7 @@ int btrfs_init_reloc_root(struct btrfs_t
+        * The subvolume has reloc tree but the swap is finished, no need to
+        * create/update the dead reloc tree
+        */
+-      if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
++      if (reloc_root_is_dead(root))
+               return 0;
+       if (root->reloc_root) {
+@@ -1478,8 +1510,7 @@ int btrfs_update_reloc_root(struct btrfs
+       struct btrfs_root_item *root_item;
+       int ret;
+-      if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+-          !root->reloc_root)
++      if (!have_reloc_root(root))
+               goto out;
+       reloc_root = root->reloc_root;
+@@ -1489,6 +1520,11 @@ int btrfs_update_reloc_root(struct btrfs
+       if (fs_info->reloc_ctl->merge_reloc_tree &&
+           btrfs_root_refs(root_item) == 0) {
+               set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
++              /*
++               * Mark the tree as dead before we change reloc_root so
++               * have_reloc_root will not touch it from now on.
++               */
++              smp_wmb();
+               __del_reloc_root(reloc_root);
+       }
+@@ -2202,6 +2238,11 @@ static int clean_dirty_subvols(struct re
+                               if (ret2 < 0 && !ret)
+                                       ret = ret2;
+                       }
++                      /*
++                       * Need barrier to ensure clear_bit() only happens after
++                       * root->reloc_root = NULL. Pairs with have_reloc_root.
++                       */
++                      smp_wmb();
+                       clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+                       btrfs_put_fs_root(root);
+               } else {
+@@ -4721,7 +4762,7 @@ void btrfs_reloc_pre_snapshot(struct btr
+       struct btrfs_root *root = pending->root;
+       struct reloc_control *rc = root->fs_info->reloc_ctl;
+-      if (!root->reloc_root || !rc)
++      if (!rc || !have_reloc_root(root))
+               return;
+       if (!rc->merge_reloc_tree)
+@@ -4755,7 +4796,7 @@ int btrfs_reloc_post_snapshot(struct btr
+       struct reloc_control *rc = root->fs_info->reloc_ctl;
+       int ret;
+-      if (!root->reloc_root || !rc)
++      if (!rc || !have_reloc_root(root))
+               return 0;
+       rc = root->fs_info->reloc_ctl;
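
The ordering the wrappers depend on reduces to a classic paired-barrier
pattern (a generic sketch; shared_root and DEAD stand in for
root::reloc_root and BTRFS_ROOT_DEAD_RELOC_TREE):

    /* writer side, as in clean_dirty_subvols(): */
    shared_root = NULL;
    smp_wmb();                    /* publish the NULL before clearing the bit */
    clear_bit(DEAD, &state);

    /* reader side, as in reloc_root_is_dead(): */
    smp_rmb();                    /* see the bit update before the pointer */
    if (test_bit(DEAD, &state))
        return;                   /* dead: never dereference shared_root */
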
diff --git a/queue-5.4/mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch b/queue-5.4/mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch
new file mode 100644 (file)
index 0000000..774c371
--- /dev/null
@@ -0,0 +1,263 @@
+From 8e57f8acbbd121ecfb0c9dc13b8b030f86c6bd3b Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 13 Jan 2020 16:29:20 -0800
+Subject: mm, debug_pagealloc: don't rely on static keys too early
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 8e57f8acbbd121ecfb0c9dc13b8b030f86c6bd3b upstream.
+
+Commit 96a2b03f281d ("mm, debug_pagelloc: use static keys to enable
+debugging") has introduced a static key to reduce overhead when
+debug_pagealloc is compiled in but not enabled.  It relied on the
+assumption that jump_label_init() is called before parse_early_param()
+as in start_kernel(), so when the "debug_pagealloc=on" option is parsed,
+it is safe to enable the static key.
+
+However, it turns out multiple architectures call parse_early_param()
+earlier from their setup_arch().  x86 also calls jump_label_init() even
+earlier, so no issue was found while testing the commit, but the same
+is not true for e.g. ppc64 and s390, where the kernel would not boot
+with debug_pagealloc=on, as found by our QA.
+
+To fix this without tricky changes to init code of multiple
+architectures, this patch partially reverts the static key conversion
+from 96a2b03f281d.  Init-time and non-fastpath calls (such as in arch
+code) of debug_pagealloc_enabled() will again test a simple bool
+variable.  Fastpath mm code is converted to a new
+debug_pagealloc_enabled_static() variant that relies on the static key,
+which is enabled in a well-defined point in mm_init() where it's
+guaranteed that jump_label_init() has been called, regardless of
+architecture.
+
+[sfr@canb.auug.org.au: export _debug_pagealloc_enabled_early]
+  Link: http://lkml.kernel.org/r/20200106164944.063ac07b@canb.auug.org.au
+Link: http://lkml.kernel.org/r/20191219130612.23171-1-vbabka@suse.cz
+Fixes: 96a2b03f281d ("mm, debug_pagelloc: use static keys to enable debugging")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Qian Cai <cai@lca.pw>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mm.h |   18 +++++++++++++++---
+ init/main.c        |    1 +
+ mm/page_alloc.c    |   37 +++++++++++++------------------------
+ mm/slab.c          |    4 ++--
+ mm/slub.c          |    2 +-
+ mm/vmalloc.c       |    4 ++--
+ 6 files changed, 34 insertions(+), 32 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2666,14 +2666,26 @@ static inline bool want_init_on_free(voi
+              !page_poisoning_enabled();
+ }
+-#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+-DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
++#ifdef CONFIG_DEBUG_PAGEALLOC
++extern void init_debug_pagealloc(void);
+ #else
+-DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
++static inline void init_debug_pagealloc(void) {}
+ #endif
++extern bool _debug_pagealloc_enabled_early;
++DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+ static inline bool debug_pagealloc_enabled(void)
+ {
++      return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
++              _debug_pagealloc_enabled_early;
++}
++
++/*
++ * For use in fast paths after init_debug_pagealloc() has run, or when a
++ * false negative result is not harmful when called too early.
++ */
++static inline bool debug_pagealloc_enabled_static(void)
++{
+       if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
+               return false;
+--- a/init/main.c
++++ b/init/main.c
+@@ -553,6 +553,7 @@ static void __init mm_init(void)
+        * bigger than MAX_ORDER unless SPARSEMEM.
+        */
+       page_ext_init_flatmem();
++      init_debug_pagealloc();
+       report_meminit();
+       mem_init();
+       kmem_cache_init();
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -694,34 +694,27 @@ void prep_compound_page(struct page *pag
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+ unsigned int _debug_guardpage_minorder;
+-#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+-DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
+-#else
++bool _debug_pagealloc_enabled_early __read_mostly
++                      = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
++EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
+ DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+-#endif
+ EXPORT_SYMBOL(_debug_pagealloc_enabled);
+ DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
+ static int __init early_debug_pagealloc(char *buf)
+ {
+-      bool enable = false;
+-
+-      if (kstrtobool(buf, &enable))
+-              return -EINVAL;
+-
+-      if (enable)
+-              static_branch_enable(&_debug_pagealloc_enabled);
+-
+-      return 0;
++      return kstrtobool(buf, &_debug_pagealloc_enabled_early);
+ }
+ early_param("debug_pagealloc", early_debug_pagealloc);
+-static void init_debug_guardpage(void)
++void init_debug_pagealloc(void)
+ {
+       if (!debug_pagealloc_enabled())
+               return;
++      static_branch_enable(&_debug_pagealloc_enabled);
++
+       if (!debug_guardpage_minorder())
+               return;
+@@ -1186,7 +1179,7 @@ static __always_inline bool free_pages_p
+        */
+       arch_free_page(page, order);
+-      if (debug_pagealloc_enabled())
++      if (debug_pagealloc_enabled_static())
+               kernel_map_pages(page, 1 << order, 0);
+       kasan_free_nondeferred_pages(page, order);
+@@ -1207,7 +1200,7 @@ static bool free_pcp_prepare(struct page
+ static bool bulkfree_pcp_prepare(struct page *page)
+ {
+-      if (debug_pagealloc_enabled())
++      if (debug_pagealloc_enabled_static())
+               return free_pages_check(page);
+       else
+               return false;
+@@ -1221,7 +1214,7 @@ static bool bulkfree_pcp_prepare(struct
+  */
+ static bool free_pcp_prepare(struct page *page)
+ {
+-      if (debug_pagealloc_enabled())
++      if (debug_pagealloc_enabled_static())
+               return free_pages_prepare(page, 0, true);
+       else
+               return free_pages_prepare(page, 0, false);
+@@ -1973,10 +1966,6 @@ void __init page_alloc_init_late(void)
+       for_each_populated_zone(zone)
+               set_zone_contiguous(zone);
+-
+-#ifdef CONFIG_DEBUG_PAGEALLOC
+-      init_debug_guardpage();
+-#endif
+ }
+ #ifdef CONFIG_CMA
+@@ -2106,7 +2095,7 @@ static inline bool free_pages_prezeroed(
+  */
+ static inline bool check_pcp_refill(struct page *page)
+ {
+-      if (debug_pagealloc_enabled())
++      if (debug_pagealloc_enabled_static())
+               return check_new_page(page);
+       else
+               return false;
+@@ -2128,7 +2117,7 @@ static inline bool check_pcp_refill(stru
+ }
+ static inline bool check_new_pcp(struct page *page)
+ {
+-      if (debug_pagealloc_enabled())
++      if (debug_pagealloc_enabled_static())
+               return check_new_page(page);
+       else
+               return false;
+@@ -2155,7 +2144,7 @@ inline void post_alloc_hook(struct page
+       set_page_refcounted(page);
+       arch_alloc_page(page, order);
+-      if (debug_pagealloc_enabled())
++      if (debug_pagealloc_enabled_static())
+               kernel_map_pages(page, 1 << order, 1);
+       kasan_alloc_pages(page, order);
+       kernel_poison_pages(page, 1 << order, 1);
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -1415,7 +1415,7 @@ static void kmem_rcu_free(struct rcu_hea
+ #if DEBUG
+ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+ {
+-      if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
++      if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
+               (cachep->size % PAGE_SIZE) == 0)
+               return true;
+@@ -2007,7 +2007,7 @@ int __kmem_cache_create(struct kmem_cach
+        * to check size >= 256. It guarantees that all necessary small
+        * sized slab is initialized in current slab initialization sequence.
+        */
+-      if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
++      if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
+               size >= 256 && cachep->object_size > cache_line_size()) {
+               if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
+                       size_t tmp_size = ALIGN(size, PAGE_SIZE);
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -290,7 +290,7 @@ static inline void *get_freepointer_safe
+       unsigned long freepointer_addr;
+       void *p;
+-      if (!debug_pagealloc_enabled())
++      if (!debug_pagealloc_enabled_static())
+               return get_freepointer(s, object);
+       freepointer_addr = (unsigned long)object + s->offset;
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -1349,7 +1349,7 @@ static void free_unmap_vmap_area(struct
+ {
+       flush_cache_vunmap(va->va_start, va->va_end);
+       unmap_vmap_area(va);
+-      if (debug_pagealloc_enabled())
++      if (debug_pagealloc_enabled_static())
+               flush_tlb_kernel_range(va->va_start, va->va_end);
+       free_vmap_area_noflush(va);
+@@ -1647,7 +1647,7 @@ static void vb_free(const void *addr, un
+       vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+-      if (debug_pagealloc_enabled())
++      if (debug_pagealloc_enabled_static())
+               flush_tlb_kernel_range((unsigned long)addr,
+                                       (unsigned long)addr + size);
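
Condensed, the pattern the patch establishes looks like this (feature
names are placeholders; the real code is in include/linux/mm.h and
mm/page_alloc.c):

    DEFINE_STATIC_KEY_FALSE(feature_key);
    bool feature_enabled_early __read_mostly;       /* plain bool: safe anytime */

    static int __init early_feature_param(char *buf)
    {
        /* may run before jump_label_init(), so only touch the bool */
        return kstrtobool(buf, &feature_enabled_early);
    }
    early_param("feature", early_feature_param);

    void init_feature(void)  /* called from mm_init(), after jump_label_init() */
    {
        if (feature_enabled_early)
            static_branch_enable(&feature_key);
    }

    static inline bool feature_enabled(void)         /* init and arch code */
    {
        return feature_enabled_early;
    }

    static inline bool feature_enabled_static(void)  /* fast paths only */
    {
        return static_branch_unlikely(&feature_key);
    }
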
diff --git a/queue-5.4/mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch b/queue-5.4/mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch
new file mode 100644 (file)
index 0000000..c49ad4a
--- /dev/null
@@ -0,0 +1,134 @@
+From 97d3d0f9a1cf132c63c0b8b8bd497b8a56283dd9 Mon Sep 17 00:00:00 2001
+From: "Kirill A. Shutemov" <kirill@shutemov.name>
+Date: Mon, 13 Jan 2020 16:29:10 -0800
+Subject: mm/huge_memory.c: thp: fix conflict of above-47bit hint address and PMD alignment
+
+From: Kirill A. Shutemov <kirill@shutemov.name>
+
+commit 97d3d0f9a1cf132c63c0b8b8bd497b8a56283dd9 upstream.
+
+Patch series "Fix two above-47bit hint address vs.  THP bugs".
+
+The two get_unmapped_area() implementations have to be fixed to provide
+THP-friendly mappings if an above-47bit hint address is specified.
+
+This patch (of 2):
+
+Filesystems use thp_get_unmapped_area() to provide THP-friendly
+mappings.  For DAX in particular.
+
+Normally, the kernel doesn't create userspace mappings above 47-bit,
+even if the machine allows this (such as with 5-level paging on x86-64).
+Not all user space is ready to handle wide addresses.  It's known that
+at least some JIT compilers use higher bits in pointers to encode their
+information.
+
+Userspace can ask for allocation from the full address space by
+specifying a hint address (with or without MAP_FIXED) above 47 bits.
+If the application doesn't need a particular address but wants to
+allocate from the whole address space, it can specify -1 as the hint
+address.
+
+Unfortunately, this trick breaks thp_get_unmapped_area(): the function
+would not try to allocate a PMD-aligned area if *any* hint address was
+specified.
+
+Modify the routine to handle it correctly:
+
+ - Try to allocate the space at the specified hint address with length
+   padding required for PMD alignment.
+ - If failed, retry without length padding (but with the same hint
+   address);
+ - If the returned address matches the hint address return it.
+ - Otherwise, align the address as required for THP and return.
+
+The user specified hint address is passed down to get_unmapped_area() so
+above-47bit hint address will be taken into account without breaking
+alignment requirements.
+
+Link: http://lkml.kernel.org/r/20191220142548.7118-2-kirill.shutemov@linux.intel.com
+Fixes: b569bab78d8d ("x86/mm: Prepare to expose larger address space to userspace")
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Reported-by: Thomas Willhalm <thomas.willhalm@intel.com>
+Tested-by: Dan Williams <dan.j.williams@intel.com>
+Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
+Cc: "Bruggeman, Otto G" <otto.g.bruggeman@intel.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c |   38 ++++++++++++++++++++++++--------------
+ 1 file changed, 24 insertions(+), 14 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -527,13 +527,13 @@ void prep_transhuge_page(struct page *pa
+       set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+ }
+-static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
++static unsigned long __thp_get_unmapped_area(struct file *filp,
++              unsigned long addr, unsigned long len,
+               loff_t off, unsigned long flags, unsigned long size)
+ {
+-      unsigned long addr;
+       loff_t off_end = off + len;
+       loff_t off_align = round_up(off, size);
+-      unsigned long len_pad;
++      unsigned long len_pad, ret;
+       if (off_end <= off_align || (off_end - off_align) < size)
+               return 0;
+@@ -542,30 +542,40 @@ static unsigned long __thp_get_unmapped_
+       if (len_pad < len || (off + len_pad) < off)
+               return 0;
+-      addr = current->mm->get_unmapped_area(filp, 0, len_pad,
++      ret = current->mm->get_unmapped_area(filp, addr, len_pad,
+                                             off >> PAGE_SHIFT, flags);
+-      if (IS_ERR_VALUE(addr))
++
++      /*
++       * The failure might be due to length padding. The caller will retry
++       * without the padding.
++       */
++      if (IS_ERR_VALUE(ret))
+               return 0;
+-      addr += (off - addr) & (size - 1);
+-      return addr;
++      /*
++       * Do not try to align to THP boundary if allocation at the address
++       * hint succeeds.
++       */
++      if (ret == addr)
++              return addr;
++
++      ret += (off - ret) & (size - 1);
++      return ret;
+ }
+ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+               unsigned long len, unsigned long pgoff, unsigned long flags)
+ {
++      unsigned long ret;
+       loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+-      if (addr)
+-              goto out;
+       if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+               goto out;
+-      addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
+-      if (addr)
+-              return addr;
+-
+- out:
++      ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
++      if (ret)
++              return ret;
++out:
+       return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+ }
+ EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
diff --git a/queue-5.4/mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch b/queue-5.4/mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch
new file mode 100644 (file)
index 0000000..3921f2e
--- /dev/null
@@ -0,0 +1,39 @@
+From 554913f600b45d73de12ad58c1ac7baa0f22a703 Mon Sep 17 00:00:00 2001
+From: Yang Shi <yang.shi@linux.alibaba.com>
+Date: Mon, 13 Jan 2020 16:29:36 -0800
+Subject: mm: khugepaged: add trace status description for SCAN_PAGE_HAS_PRIVATE
+
+From: Yang Shi <yang.shi@linux.alibaba.com>
+
+commit 554913f600b45d73de12ad58c1ac7baa0f22a703 upstream.
+
+Commit 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem)
+FS") introduced a new khugepaged scan result: SCAN_PAGE_HAS_PRIVATE, but
+the corresponding description for the trace events was not added.
+
+Link: http://lkml.kernel.org/r/1574793844-2914-1-git-send-email-yang.shi@linux.alibaba.com
+Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS")
+Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
+Cc: Song Liu <songliubraving@fb.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/trace/events/huge_memory.h |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/include/trace/events/huge_memory.h
++++ b/include/trace/events/huge_memory.h
+@@ -31,7 +31,8 @@
+       EM( SCAN_ALLOC_HUGE_PAGE_FAIL,  "alloc_huge_page_failed")       \
+       EM( SCAN_CGROUP_CHARGE_FAIL,    "ccgroup_charge_failed")        \
+       EM( SCAN_EXCEED_SWAP_PTE,       "exceed_swap_pte")              \
+-      EMe(SCAN_TRUNCATED,             "truncated")                    \
++      EM( SCAN_TRUNCATED,             "truncated")                    \
++      EMe(SCAN_PAGE_HAS_PRIVATE,      "page_has_private")             \
+ #undef EM
+ #undef EMe
diff --git a/queue-5.4/mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch b/queue-5.4/mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch
new file mode 100644 (file)
index 0000000..b507d78
--- /dev/null
@@ -0,0 +1,85 @@
+From 2fe20210fc5f5e62644678b8f927c49f2c6f42a7 Mon Sep 17 00:00:00 2001
+From: Adrian Huang <ahuang12@lenovo.com>
+Date: Mon, 13 Jan 2020 16:29:32 -0800
+Subject: mm: memcg/slab: call flush_memcg_workqueue() only if memcg workqueue is valid
+
+From: Adrian Huang <ahuang12@lenovo.com>
+
+commit 2fe20210fc5f5e62644678b8f927c49f2c6f42a7 upstream.
+
+When booting with amd_iommu=off, the following WARNING message
+appears:
+
+  AMD-Vi: AMD IOMMU disabled on kernel command-line
+  ------------[ cut here ]------------
+  WARNING: CPU: 0 PID: 0 at kernel/workqueue.c:2772 flush_workqueue+0x42e/0x450
+  Modules linked in:
+  CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.5.0-rc3-amd-iommu #6
+  Hardware name: Lenovo ThinkSystem SR655-2S/7D2WRCZ000, BIOS D8E101L-1.00 12/05/2019
+  RIP: 0010:flush_workqueue+0x42e/0x450
+  Code: ff 0f 0b e9 7a fd ff ff 4d 89 ef e9 33 fe ff ff 0f 0b e9 7f fd ff ff 0f 0b e9 bc fd ff ff 0f 0b e9 a8 fd ff ff e8 52 2c fe ff <0f> 0b 31 d2 48 c7 c6 e0 88 c5 95 48 c7 c7 d8 ad f0 95 e8 19 f5 04
+  Call Trace:
+   kmem_cache_destroy+0x69/0x260
+   iommu_go_to_state+0x40c/0x5ab
+   amd_iommu_prepare+0x16/0x2a
+   irq_remapping_prepare+0x36/0x5f
+   enable_IR_x2apic+0x21/0x172
+   default_setup_apic_routing+0x12/0x6f
+   apic_intr_mode_init+0x1a1/0x1f1
+   x86_late_time_init+0x17/0x1c
+   start_kernel+0x480/0x53f
+   secondary_startup_64+0xb6/0xc0
+  ---[ end trace 30894107c3749449 ]---
+  x2apic: IRQ remapping doesn't support X2APIC mode
+  x2apic disabled
+
+The warning is caused by the call to 'kmem_cache_destroy()'
+in free_iommu_resources(). Here is the call path:
+
+  free_iommu_resources
+    kmem_cache_destroy
+      flush_memcg_workqueue
+        flush_workqueue
+
+The root cause is that the IOMMU subsystem runs before the workqueue
+subsystem, at which point the variable 'wq_online' is still 'false'.
+This causes the check 'if (WARN_ON(!wq_online))' in flush_workqueue()
+to trigger.
+
+Since the variable 'memcg_kmem_cache_wq' has not been allocated at
+that point, it is unnecessary to call flush_memcg_workqueue().  This
+prevents the WARNING message triggered by flush_workqueue().
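+
+A minimal sketch of the guard applied here (illustrative only; the
+names mirror the kernel's, but this is not the patch itself):
+
+  static struct workqueue_struct *memcg_kmem_cache_wq;
+
+  static void flush_memcg_workqueue(void)
+  {
+          /* Callers running before workqueue_init() see a NULL queue
+           * pointer; flushing must be skipped then, or
+           * WARN_ON(!wq_online) in flush_workqueue() fires.
+           */
+          if (likely(memcg_kmem_cache_wq))
+                  flush_workqueue(memcg_kmem_cache_wq);
+  }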
+
+Link: http://lkml.kernel.org/r/20200103085503.1665-1-ahuang12@lenovo.com
+Fixes: 92ee383f6daab ("mm: fix race between kmem_cache destroy, create and deactivate")
+Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
+Reported-by: Xiaochun Lee <lixc17@lenovo.com>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Cc: Joerg Roedel <jroedel@suse.de>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Pekka Enberg <penberg@kernel.org>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/slab_common.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -903,7 +903,8 @@ static void flush_memcg_workqueue(struct
+        * deactivates the memcg kmem_caches through workqueue. Make sure all
+        * previous workitems on workqueue are processed.
+        */
+-      flush_workqueue(memcg_kmem_cache_wq);
++      if (likely(memcg_kmem_cache_wq))
++              flush_workqueue(memcg_kmem_cache_wq);
+       /*
+        * If we're racing with children kmem_cache deactivation, it might
diff --git a/queue-5.4/mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch b/queue-5.4/mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch
new file mode 100644 (file)
index 0000000..6d4913b
--- /dev/null
@@ -0,0 +1,178 @@
+From 4a87e2a25dc27131c3cce5e94421622193305638 Mon Sep 17 00:00:00 2001
+From: Roman Gushchin <guro@fb.com>
+Date: Mon, 13 Jan 2020 16:29:16 -0800
+Subject: mm: memcg/slab: fix percpu slab vmstats flushing
+
+From: Roman Gushchin <guro@fb.com>
+
+commit 4a87e2a25dc27131c3cce5e94421622193305638 upstream.
+
+Currently slab percpu vmstats are flushed twice: during the memcg
+offlining and just before freeing the memcg structure.  Each time percpu
+counters are summed, added to the atomic counterparts and propagated up
+by the cgroup tree.
+
+The second flushing is required due to how recursive vmstats are
+implemented: counters are batched in percpu variables on a local level,
+and once a percpu value crosses some predefined threshold, it spills
+over to atomic values on the local and each ancestor level.  It means
+that without flushing, some numbers cached in percpu variables will be
+dropped on the floor each time a cgroup is destroyed.  And with uptime
+the error on upper levels might become noticeable.
+
+The first flushing aims to make counters on ancestor levels more
+precise.  Dying cgroups may remain in the dying state for a long time.
+After the kmem_cache reparenting performed during the offlining, slab
+counters of the dying cgroup don't have any chance to be updated,
+because any slab operations will be performed on the parent level.  It
+means that the inaccuracy caused by percpu batching will not decrease
+up to the final destruction of the cgroup.  The original idea was that
+flushing slab counters during the offlining should minimize the
+visible inaccuracy of slab counters on the parent level.
+
+The problem is that percpu counters are not zeroed after the first
+flushing, so every cached percpu value is summed twice.  It creates a
+small error (up to 32 pages per cpu, but usually less) which
+accumulates on the parent cgroup level.  After creating and destroying
+thousands of child cgroups, the slab counter on the parent level can
+be way off the real value.
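+
+As a concrete illustration (hypothetical numbers): if a percpu counter
+caches 32 pages when the cgroup dies, the offline flush adds 32 to the
+parent's atomic counter without zeroing the percpu value, and the
+final flush at free time adds the same 32 again, leaving the parent
+32 pages per cpu too high for that one cgroup.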
+
+For now, let's just stop flushing slab counters on memcg offlining.  It
+can't be done correctly without scheduling a work on each cpu: reading
+and zeroing it during css offlining can race with an asynchronous
+update, which doesn't expect values to be changed underneath.
+
+With this change, slab counters on parent level will become eventually
+consistent.  Once all dying children are gone, values are correct.  And
+if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
+
+It's not perfect, as slabs are reparented, so any updates after the
+reparenting will happen on the parent level.  It means that if a slab
+page was allocated, a counter on the child level was bumped, then the
+page was reparented and freed, the annihilation of positive and
+negative counter values will not happen until the child cgroup is
+released.  It makes slab counters different from others, and may
+eventually require us to implement flushing in a correct form again.
+But it's also a question of performance: scheduling a work item on
+each cpu isn't free, and it's an open question if the benefit of
+having more accurate counters is worth it.
+
+We might also consider flushing all counters on offlining, not only slab
+counters.
+
+So let's fix the main problem now: make the slab counters eventually
+consistent, so at least the error won't grow with uptime (or more
+precisely the number of created and destroyed cgroups).  And think about
+the accuracy of counters separately.
+
+Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
+Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
+Signed-off-by: Roman Gushchin <guro@fb.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mmzone.h |    5 ++---
+ mm/memcontrol.c        |   37 +++++++++----------------------------
+ 2 files changed, 11 insertions(+), 31 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -215,9 +215,8 @@ enum node_stat_item {
+       NR_INACTIVE_FILE,       /*  "     "     "   "       "         */
+       NR_ACTIVE_FILE,         /*  "     "     "   "       "         */
+       NR_UNEVICTABLE,         /*  "     "     "   "       "         */
+-      NR_SLAB_RECLAIMABLE,    /* Please do not reorder this item */
+-      NR_SLAB_UNRECLAIMABLE,  /* and this one without looking at
+-                               * memcg_flush_percpu_vmstats() first. */
++      NR_SLAB_RECLAIMABLE,
++      NR_SLAB_UNRECLAIMABLE,
+       NR_ISOLATED_ANON,       /* Temporary isolated pages from anon lru */
+       NR_ISOLATED_FILE,       /* Temporary isolated pages from file lru */
+       WORKINGSET_NODES,
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3404,49 +3404,34 @@ static u64 mem_cgroup_read_u64(struct cg
+       }
+ }
+-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
++static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+ {
+-      unsigned long stat[MEMCG_NR_STAT];
++      unsigned long stat[MEMCG_NR_STAT] = {0};
+       struct mem_cgroup *mi;
+       int node, cpu, i;
+-      int min_idx, max_idx;
+-
+-      if (slab_only) {
+-              min_idx = NR_SLAB_RECLAIMABLE;
+-              max_idx = NR_SLAB_UNRECLAIMABLE;
+-      } else {
+-              min_idx = 0;
+-              max_idx = MEMCG_NR_STAT;
+-      }
+-
+-      for (i = min_idx; i < max_idx; i++)
+-              stat[i] = 0;
+       for_each_online_cpu(cpu)
+-              for (i = min_idx; i < max_idx; i++)
++              for (i = 0; i < MEMCG_NR_STAT; i++)
+                       stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+       for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+-              for (i = min_idx; i < max_idx; i++)
++              for (i = 0; i < MEMCG_NR_STAT; i++)
+                       atomic_long_add(stat[i], &mi->vmstats[i]);
+-      if (!slab_only)
+-              max_idx = NR_VM_NODE_STAT_ITEMS;
+-
+       for_each_node(node) {
+               struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+               struct mem_cgroup_per_node *pi;
+-              for (i = min_idx; i < max_idx; i++)
++              for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                       stat[i] = 0;
+               for_each_online_cpu(cpu)
+-                      for (i = min_idx; i < max_idx; i++)
++                      for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                               stat[i] += per_cpu(
+                                       pn->lruvec_stat_cpu->count[i], cpu);
+               for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+-                      for (i = min_idx; i < max_idx; i++)
++                      for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+                               atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+       }
+ }
+@@ -3520,13 +3505,9 @@ static void memcg_offline_kmem(struct me
+               parent = root_mem_cgroup;
+       /*
+-       * Deactivate and reparent kmem_caches. Then flush percpu
+-       * slab statistics to have precise values at the parent and
+-       * all ancestor levels. It's required to keep slab stats
+-       * accurate after the reparenting of kmem_caches.
++       * Deactivate and reparent kmem_caches.
+        */
+       memcg_deactivate_kmem_caches(memcg, parent);
+-      memcg_flush_percpu_vmstats(memcg, true);
+       kmemcg_id = memcg->kmemcg_id;
+       BUG_ON(kmemcg_id < 0);
+@@ -5037,7 +5018,7 @@ static void mem_cgroup_free(struct mem_c
+        * Flush percpu vmstats and vmevents to guarantee the value correctness
+        * on parent's and all ancestor levels.
+        */
+-      memcg_flush_percpu_vmstats(memcg, false);
++      memcg_flush_percpu_vmstats(memcg);
+       memcg_flush_percpu_vmevents(memcg);
+       __mem_cgroup_free(memcg);
+ }
diff --git a/queue-5.4/mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch b/queue-5.4/mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch
new file mode 100644 (file)
index 0000000..621679f
--- /dev/null
@@ -0,0 +1,97 @@
+From 8068df3b60373c390198f660574ea14c8098de57 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Mon, 13 Jan 2020 16:29:07 -0800
+Subject: mm/memory_hotplug: don't free usage map when removing a re-added early section
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 8068df3b60373c390198f660574ea14c8098de57 upstream.
+
+When we remove an early section, we don't free the usage map, as the
+usage maps of other sections are placed into the same page.  Once the
+section is removed, it is no longer an early section (especially, the
+memmap is freed).  When we re-add that section, the usage map is reused,
+however, it is no longer an early section.  When removing that section
+again, we try to kfree() a usage map that was allocated during early
+boot - bad.
+
+Let's check against PageReserved() to see if we are dealing with a
+usage map that was allocated during boot.  We could also check against
+!(PageSlab(usage_page) || PageCompound(usage_page)), but PageReserved() is
+cleaner.
+
+Can be triggered using memtrace under ppc64/powernv:
+
+  $ mount -t debugfs none /sys/kernel/debug/
+  $ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable
+  $ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable
+   ------------[ cut here ]------------
+   kernel BUG at mm/slub.c:3969!
+   Oops: Exception in kernel mode, sig: 5 [#1]
+   LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
+   Modules linked in:
+   CPU: 0 PID: 154 Comm: sh Not tainted 5.5.0-rc2-next-20191216-00005-g0be1dba7b7c0 #61
+   NIP kfree+0x338/0x3b0
+   LR section_deactivate+0x138/0x200
+   Call Trace:
+     section_deactivate+0x138/0x200
+     __remove_pages+0x114/0x150
+     arch_remove_memory+0x3c/0x160
+     try_remove_memory+0x114/0x1a0
+     __remove_memory+0x20/0x40
+     memtrace_enable_set+0x254/0x850
+     simple_attr_write+0x138/0x160
+     full_proxy_write+0x8c/0x110
+     __vfs_write+0x38/0x70
+     vfs_write+0x11c/0x2a0
+     ksys_write+0x84/0x140
+     system_call+0x5c/0x68
+   ---[ end trace 4b053cbd84e0db62 ]---
+
+The first invocation will offline+remove memory blocks.  The second
+invocation will first add+online them again, in order to offline+remove
+them again (usually we are lucky and the exact same memory blocks will
+get "reallocated").
+
+Tested on powernv with boot memory: The usage map will not get freed.
+Tested on x86-64 with DIMMs: The usage map will get freed.
+
+Using Dynamic Memory under a Power DLPAR can trigger it easily.
+
+Triggering removal (presumably of memory that was previously removed
+and re-added) from the HMC GUI can crash the kernel with the same call
+trace, and is fixed by this patch.
+
+Link: http://lkml.kernel.org/r/20191217104637.5509-1-david@redhat.com
+Fixes: 326e1b8f83a4 ("mm/sparsemem: introduce a SECTION_IS_EARLY flag")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Tested-by: Pingfan Liu <piliu@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Michal Hocko <mhocko@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/sparse.c |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/sparse.c
++++ b/mm/sparse.c
+@@ -775,7 +775,14 @@ static void section_deactivate(unsigned
+       if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+               unsigned long section_nr = pfn_to_section_nr(pfn);
+-              if (!section_is_early) {
++              /*
++               * When removing an early section, the usage map is kept (as the
++               * usage maps of other sections fall into the same page). It
++               * will be re-used when re-adding the section - which is then no
++               * longer an early section. If the usage map is PageReserved, it
++               * was allocated during boot.
++               */
++              if (!PageReserved(virt_to_page(ms->usage))) {
+                       kfree(ms->usage);
+                       ms->usage = NULL;
+               }
diff --git a/queue-5.4/mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch b/queue-5.4/mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch
new file mode 100644 (file)
index 0000000..459b8e5
--- /dev/null
@@ -0,0 +1,79 @@
+From 6d9e8c651dd979aa666bee15f086745f3ea9c4b3 Mon Sep 17 00:00:00 2001
+From: Wen Yang <wenyang@linux.alibaba.com>
+Date: Mon, 13 Jan 2020 16:29:23 -0800
+Subject: mm/page-writeback.c: avoid potential division by zero in wb_min_max_ratio()
+
+From: Wen Yang <wenyang@linux.alibaba.com>
+
+commit 6d9e8c651dd979aa666bee15f086745f3ea9c4b3 upstream.
+
+Patch series "use div64_ul() instead of div_u64() if the divisor is
+unsigned long".
+
+We were first inspired by commit b0ab99e7736a ("sched: Fix possible
+divide by zero in avg_atom() calculation"); then, referring to the
+recently analyzed mm code, we found this suspicious place.
+
+ 201                 if (min) {
+ 202                         min *= this_bw;
+ 203                         do_div(min, tot_bw);
+ 204                 }
+
+And we also disassembled and confirmed it:
+
+  /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 201
+  0xffffffff811c37da <__wb_calc_thresh+234>:      xor    %r10d,%r10d
+  0xffffffff811c37dd <__wb_calc_thresh+237>:      test   %rax,%rax
+  0xffffffff811c37e0 <__wb_calc_thresh+240>:      je 0xffffffff811c3800 <__wb_calc_thresh+272>
+  /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 202
+  0xffffffff811c37e2 <__wb_calc_thresh+242>:      imul   %r8,%rax
+  /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 203
+  0xffffffff811c37e6 <__wb_calc_thresh+246>:      mov    %r9d,%r10d    ---> truncates it to 32 bits here
+  0xffffffff811c37e9 <__wb_calc_thresh+249>:      xor    %edx,%edx
+  0xffffffff811c37eb <__wb_calc_thresh+251>:      div    %r10
+  0xffffffff811c37ee <__wb_calc_thresh+254>:      imul   %rbx,%rax
+  0xffffffff811c37f2 <__wb_calc_thresh+258>:      shr    $0x2,%rax
+  0xffffffff811c37f6 <__wb_calc_thresh+262>:      mul    %rcx
+  0xffffffff811c37f9 <__wb_calc_thresh+265>:      shr    $0x2,%rdx
+  0xffffffff811c37fd <__wb_calc_thresh+269>:      mov    %rdx,%r10
+
+This series uses div64_ul() instead of div_u64() if the divisor is
+unsigned long, to avoid truncation to 32-bit on 64-bit platforms.
+
+This patch (of 3):
+
+The divisor 'tot_bw' is unsigned long and do_div() truncates it to
+32 bits, which means it can test as non-zero yet be truncated to zero
+for the division.  Fix this issue by using div64_ul() instead.
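+
+A self-contained userspace illustration of the truncation (do_div() is
+a kernel macro, so an explicit cast stands in for it here; the values
+are made up):
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          unsigned long tot_bw = 1UL << 33;  /* non-zero, > 32 bits */
+          /* do_div() takes a 32-bit divisor, so only the low 32 bits
+           * of tot_bw survive -- here that is 0, hence the potential
+           * division by zero.
+           */
+          unsigned int truncated = (unsigned int)tot_bw;
+          printf("tot_bw=%lu truncated=%u\n", tot_bw, truncated);
+          return 0;
+  }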
+
+Link: http://lkml.kernel.org/r/20200102081442.8273-2-wenyang@linux.alibaba.com
+Fixes: 693108a8a667 ("writeback: make bdi->min/max_ratio handling cgroup writeback aware")
+Signed-off-by: Wen Yang <wenyang@linux.alibaba.com>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Qian Cai <cai@lca.pw>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page-writeback.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -201,11 +201,11 @@ static void wb_min_max_ratio(struct bdi_
+       if (this_bw < tot_bw) {
+               if (min) {
+                       min *= this_bw;
+-                      do_div(min, tot_bw);
++                      min = div64_ul(min, tot_bw);
+               }
+               if (max < 100) {
+                       max *= this_bw;
+-                      do_div(max, tot_bw);
++                      max = div64_ul(max, tot_bw);
+               }
+       }
diff --git a/queue-5.4/mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch b/queue-5.4/mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch
new file mode 100644 (file)
index 0000000..d788d4f
--- /dev/null
@@ -0,0 +1,74 @@
+From 991589974d9c9ecb24ee3799ec8c415c730598a2 Mon Sep 17 00:00:00 2001
+From: "Kirill A. Shutemov" <kirill@shutemov.name>
+Date: Mon, 13 Jan 2020 16:29:13 -0800
+Subject: mm/shmem.c: thp, shmem: fix conflict of above-47bit hint address and PMD alignment
+
+From: Kirill A. Shutemov <kirill@shutemov.name>
+
+commit 991589974d9c9ecb24ee3799ec8c415c730598a2 upstream.
+
+Shmem/tmpfs tries to provide THP-friendly mappings if huge pages are
+enabled.  But it doesn't work well with an above-47bit hint address.
+
+Normally, the kernel doesn't create userspace mappings above 47-bit,
+even if the machine allows this (such as with 5-level paging on x86-64).
+Not all user space is ready to handle wide addresses.  It's known that
+at least some JIT compilers use higher bits in pointers to encode their
+information.
+
+Userspace can ask for allocation from full address space by specifying
+hint address (with or without MAP_FIXED) above 47-bits.  If the
+application doesn't need a particular address, but wants to allocate
+from whole address space it can specify -1 as a hint address.
+
+Unfortunately, this trick breaks THP alignment in shmem/tmpfs:
+shmem_get_unmapped_area() would not try to allocate a PMD-aligned area
+if *any* hint address is specified.
+
+This can be fixed by requesting the aligned area if we failed to
+allocate at the user-specified hint address.  The request with
+inflated length will also take the user-specified hint address.  This
+way we will not lose an allocation request from the full address
+space.
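+
+In sketch form, the resulting flow is roughly this (simplified from
+the hunk below, not a verbatim copy):
+
+  addr = get_area(file, uaddr, len, 0, flags);  /* plain attempt     */
+  if (addr == uaddr)
+          return addr;                          /* hint was honoured */
+  /* Otherwise retry with inflated length, still passing the hint,
+   * and round the result up to a PMD boundary.
+   */
+  inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);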
+
+[kirill@shutemov.name: fold in a fixup]
+  Link: http://lkml.kernel.org/r/20191223231309.t6bh5hkbmokihpfu@box
+Link: http://lkml.kernel.org/r/20191220142548.7118-3-kirill.shutemov@linux.intel.com
+Fixes: b569bab78d8d ("x86/mm: Prepare to expose larger address space to userspace")
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: "Willhalm, Thomas" <thomas.willhalm@intel.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: "Bruggeman, Otto G" <otto.g.bruggeman@intel.com>
+Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/shmem.c |    7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2106,9 +2106,10 @@ unsigned long shmem_get_unmapped_area(st
+       /*
+        * Our priority is to support MAP_SHARED mapped hugely;
+        * and support MAP_PRIVATE mapped hugely too, until it is COWed.
+-       * But if caller specified an address hint, respect that as before.
++       * But if caller specified an address hint and we allocated area there
++       * successfully, respect that as before.
+        */
+-      if (uaddr)
++      if (uaddr == addr)
+               return addr;
+       if (shmem_huge != SHMEM_HUGE_FORCE) {
+@@ -2142,7 +2143,7 @@ unsigned long shmem_get_unmapped_area(st
+       if (inflated_len < len)
+               return addr;
+-      inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
++      inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
+       if (IS_ERR_VALUE(inflated_addr))
+               return addr;
+       if (inflated_addr & ~PAGE_MASK)
diff --git a/queue-5.4/net-fix-kernel-doc-warning-in-linux-netdevice.h.patch b/queue-5.4/net-fix-kernel-doc-warning-in-linux-netdevice.h.patch
new file mode 100644 (file)
index 0000000..b292fd2
--- /dev/null
@@ -0,0 +1,34 @@
+From 1f26c0d3d24125992ab0026b0dab16c08df947c7 Mon Sep 17 00:00:00 2001
+From: Randy Dunlap <rdunlap@infradead.org>
+Date: Mon, 16 Dec 2019 18:52:45 -0800
+Subject: net: fix kernel-doc warning in <linux/netdevice.h>
+
+From: Randy Dunlap <rdunlap@infradead.org>
+
+commit 1f26c0d3d24125992ab0026b0dab16c08df947c7 upstream.
+
+Fix missing '*' kernel-doc notation that causes this warning:
+
+../include/linux/netdevice.h:1779: warning: bad line:                                 spinlock
+
+Fixes: ab92d68fc22f ("net: core: add generic lockdep keys")
+Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
+Cc: Taehee Yoo <ap420073@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/netdevice.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -1761,7 +1761,7 @@ enum netdev_priv_flags {
+  *                    for hardware timestamping
+  *    @sfp_bus:       attached &struct sfp_bus structure.
+  *    @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock
+-                              spinlock
++ *                            spinlock
+  *    @qdisc_running_key:     lockdep class annotating Qdisc->running seqcount
+  *    @qdisc_xmit_lock_key:   lockdep class annotating
+  *                            netdev_queue->_xmit_lock spinlock
diff --git a/queue-5.4/net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch b/queue-5.4/net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch
new file mode 100644 (file)
index 0000000..70c878d
--- /dev/null
@@ -0,0 +1,34 @@
+From 8605131747e7e1fd8f6c9f97a00287aae2b2c640 Mon Sep 17 00:00:00 2001
+From: Jose Abreu <Jose.Abreu@synopsys.com>
+Date: Wed, 18 Dec 2019 11:17:41 +0100
+Subject: net: stmmac: 16KB buffer must be 16 byte aligned
+
+From: Jose Abreu <Jose.Abreu@synopsys.com>
+
+commit 8605131747e7e1fd8f6c9f97a00287aae2b2c640 upstream.
+
+The 16KB RX Buffer must also be 16 byte aligned. Fix it.
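+
+(For reference, the hunk below changes BUF_SIZE_16KiB from 16384 to
+16368, i.e. 16 KiB minus 16 bytes, which keeps the value a multiple
+of 16 as the adjacent comment requires.)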
+
+Fixes: 7ac6653a085b ("stmmac: Move the STMicroelectronics driver")
+Signed-off-by: Jose Abreu <Jose.Abreu@synopsys.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/ethernet/stmicro/stmmac/common.h |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/ethernet/stmicro/stmmac/common.h
++++ b/drivers/net/ethernet/stmicro/stmmac/common.h
+@@ -364,9 +364,8 @@ struct dma_features {
+       unsigned int arpoffsel;
+ };
+-/* GMAC TX FIFO is 8K, Rx FIFO is 16K */
+-#define BUF_SIZE_16KiB 16384
+-/* RX Buffer size must be < 8191 and multiple of 4/8/16 bytes */
++/* RX Buffer size must be multiple of 4/8/16 bytes */
++#define BUF_SIZE_16KiB 16368
+ #define BUF_SIZE_8KiB 8188
+ #define BUF_SIZE_4KiB 4096
+ #define BUF_SIZE_2KiB 2048
diff --git a/queue-5.4/net-stmmac-enable-16kb-buffer-size.patch b/queue-5.4/net-stmmac-enable-16kb-buffer-size.patch
new file mode 100644 (file)
index 0000000..c65ab71
--- /dev/null
@@ -0,0 +1,34 @@
+From b2f3a481c4cd62f78391b836b64c0a6e72b503d2 Mon Sep 17 00:00:00 2001
+From: Jose Abreu <Jose.Abreu@synopsys.com>
+Date: Wed, 18 Dec 2019 11:17:42 +0100
+Subject: net: stmmac: Enable 16KB buffer size
+
+From: Jose Abreu <Jose.Abreu@synopsys.com>
+
+commit b2f3a481c4cd62f78391b836b64c0a6e72b503d2 upstream.
+
+XGMAC supports an MTU of up to 16KB.  Let's add this check to the
+calculation of the RX buffer size.
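+
+For example, with this change an MTU of 9000 (>= BUF_SIZE_8KiB, 8188)
+now selects BUF_SIZE_16KiB instead of being capped at BUF_SIZE_8KiB.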
+
+Fixes: 7ac6653a085b ("stmmac: Move the STMicroelectronics driver")
+Signed-off-by: Jose Abreu <Jose.Abreu@synopsys.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+@@ -1108,7 +1108,9 @@ static int stmmac_set_bfsize(int mtu, in
+ {
+       int ret = bufsize;
+-      if (mtu >= BUF_SIZE_4KiB)
++      if (mtu >= BUF_SIZE_8KiB)
++              ret = BUF_SIZE_16KiB;
++      else if (mtu >= BUF_SIZE_4KiB)
+               ret = BUF_SIZE_8KiB;
+       else if (mtu >= BUF_SIZE_2KiB)
+               ret = BUF_SIZE_4KiB;
diff --git a/queue-5.4/reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch b/queue-5.4/reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch
new file mode 100644 (file)
index 0000000..1a9e9df
--- /dev/null
@@ -0,0 +1,46 @@
+From 723c0011c7f6992f57e2c629fa9c89141acc115f Mon Sep 17 00:00:00 2001
+From: Geert Uytterhoeven <geert+renesas@glider.be>
+Date: Wed, 20 Nov 2019 15:26:13 +0100
+Subject: reset: Fix {of,devm}_reset_control_array_get kerneldoc return types
+
+From: Geert Uytterhoeven <geert+renesas@glider.be>
+
+commit 723c0011c7f6992f57e2c629fa9c89141acc115f upstream.
+
+of_reset_control_array_get() and devm_reset_control_array_get() return
+struct reset_control pointers, not internal struct reset_control_array
+pointers, just like all other reset control API calls.
+
+Correct the kerneldoc to match the code.
+
+Fixes: 17c82e206d2a3cd8 ("reset: Add APIs to manage array of resets")
+Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
+Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/reset/core.c |    6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/drivers/reset/core.c
++++ b/drivers/reset/core.c
+@@ -861,8 +861,7 @@ static int of_reset_control_get_count(st
+  * @acquired: only one reset control may be acquired for a given controller
+  *            and ID
+  *
+- * Returns pointer to allocated reset_control_array on success or
+- * error on failure
++ * Returns pointer to allocated reset_control on success or error on failure
+  */
+ struct reset_control *
+ of_reset_control_array_get(struct device_node *np, bool shared, bool optional,
+@@ -915,8 +914,7 @@ EXPORT_SYMBOL_GPL(of_reset_control_array
+  * that just have to be asserted or deasserted, without any
+  * requirements on the order.
+  *
+- * Returns pointer to allocated reset_control_array on success or
+- * error on failure
++ * Returns pointer to allocated reset_control on success or error on failure
+  */
+ struct reset_control *
+ devm_reset_control_array_get(struct device *dev, bool shared, bool optional)
index d1378be9b41a2553320ba85f838bf1592ca2b9e8..65366b23e29f6b4de34f9f6be786d66c87a5c4db 100644 (file)
@@ -75,3 +75,30 @@ clk-samsung-exynos5420-keep-top-g3d-clocks-enabled.patch
 perf-hists-fix-variable-name-s-inconsistency-in-hists__for_each-macro.patch
 locking-lockdep-fix-buffer-overrun-problem-in-stack_trace.patch
 perf-report-fix-incorrectly-added-dimensions-as-switch-perf-data-file.patch
+mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch
+mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch
+mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch
+mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch
+mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch
+btrfs-fix-invalid-removal-of-root-ref.patch
+btrfs-do-not-delete-mismatched-root-refs.patch
+btrfs-relocation-fix-reloc_root-lifespan-and-access.patch
+btrfs-fix-memory-leak-in-qgroup-accounting.patch
+btrfs-check-rw_devices-not-num_devices-for-balance.patch
+btrfs-always-copy-scrub-arguments-back-to-user-space.patch
+mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch
+mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch
+mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch
+arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch
+arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch
+arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch
+arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch
+arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch
+arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch
+net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch
+net-stmmac-enable-16kb-buffer-size.patch
+reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch
+tipc-fix-potential-hanging-after-b-rcast-changing.patch
+tipc-fix-retrans-failure-due-to-wrong-destination.patch
+net-fix-kernel-doc-warning-in-linux-netdevice.h.patch
+block-fix-the-type-of-sts-in-bsg_queue_rq.patch
diff --git a/queue-5.4/tipc-fix-potential-hanging-after-b-rcast-changing.patch b/queue-5.4/tipc-fix-potential-hanging-after-b-rcast-changing.patch
new file mode 100644 (file)
index 0000000..4d24c7d
--- /dev/null
@@ -0,0 +1,105 @@
+From dca4a17d24ee9d878836ce5eb8dc25be1ffa5729 Mon Sep 17 00:00:00 2001
+From: Tuong Lien <tuong.t.lien@dektech.com.au>
+Date: Tue, 10 Dec 2019 15:21:03 +0700
+Subject: tipc: fix potential hanging after b/rcast changing
+
+From: Tuong Lien <tuong.t.lien@dektech.com.au>
+
+commit dca4a17d24ee9d878836ce5eb8dc25be1ffa5729 upstream.
+
+In commit c55c8edafa91 ("tipc: smooth change between replicast and
+broadcast"), we allow instant switching between replicast and broadcast
+by sending a dummy 'SYN' packet on the last used link to synchronize
+packets on the links. The 'SYN' message is itself subject to link
+congestion, so if that happens, a 'SOCK_WAKEUP' will be scheduled to
+be sent back to the socket...
+However, in that commit, we simply use the same socket 'cong_link_cnt'
+counter for both the 'SYN' & normal payload message sending. Therefore,
+if both the replicast & broadcast links are congested, the counter will
+not be updated correctly but will be overwritten by the latter
+congestion.
+Later on, when the 'SOCK_WAKEUP' messages are processed, the counter is
+decremented one by one and eventually overflows. Consequently, further
+activities on the socket will only wait for the false congestion signal
+to disappear, which never happens.
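+
+For example (a hypothetical trace): the 'SYN' on the broadcast link
+reports one congested link and the payload on the replicast link
+reports one as well; the second report overwrites the first instead of
+adding to it, so the socket believes in one congestion while two
+'SOCK_WAKEUP' messages are pending, and the second wakeup drives the
+counter past zero so it wraps.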
+
+Because sending the 'SYN' message is vital for the mechanism, it should
+be done anyway. This commit fixes the issue by marking the message with
+an error code, e.g. 'TIPC_ERR_NO_PORT', so its sending will not face
+link congestion and there is no need to touch the socket's
+'cong_link_cnt' either. In addition, in the event of any error (e.g.
+-ENOBUFS), we will purge the entire payload message queue and return
+immediately.
+
+Fixes: c55c8edafa91 ("tipc: smooth change between replicast and broadcast")
+Acked-by: Jon Maloy <jon.maloy@ericsson.com>
+Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/tipc/bcast.c |   24 +++++++++++++++---------
+ 1 file changed, 15 insertions(+), 9 deletions(-)
+
+--- a/net/tipc/bcast.c
++++ b/net/tipc/bcast.c
+@@ -305,17 +305,17 @@ static int tipc_rcast_xmit(struct net *n
+  * @skb: socket buffer to copy
+  * @method: send method to be used
+  * @dests: destination nodes for message.
+- * @cong_link_cnt: returns number of encountered congested destination links
+  * Returns 0 if success, otherwise errno
+  */
+ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb,
+                               struct tipc_mc_method *method,
+-                              struct tipc_nlist *dests,
+-                              u16 *cong_link_cnt)
++                              struct tipc_nlist *dests)
+ {
+       struct tipc_msg *hdr, *_hdr;
+       struct sk_buff_head tmpq;
+       struct sk_buff *_skb;
++      u16 cong_link_cnt;
++      int rc = 0;
+       /* Is a cluster supporting with new capabilities ? */
+       if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL))
+@@ -343,18 +343,19 @@ static int tipc_mcast_send_sync(struct n
+       _hdr = buf_msg(_skb);
+       msg_set_size(_hdr, MCAST_H_SIZE);
+       msg_set_is_rcast(_hdr, !msg_is_rcast(hdr));
++      msg_set_errcode(_hdr, TIPC_ERR_NO_PORT);
+       __skb_queue_head_init(&tmpq);
+       __skb_queue_tail(&tmpq, _skb);
+       if (method->rcast)
+-              tipc_bcast_xmit(net, &tmpq, cong_link_cnt);
++              rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt);
+       else
+-              tipc_rcast_xmit(net, &tmpq, dests, cong_link_cnt);
++              rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt);
+       /* This queue should normally be empty by now */
+       __skb_queue_purge(&tmpq);
+-      return 0;
++      return rc;
+ }
+ /* tipc_mcast_xmit - deliver message to indicated destination nodes
+@@ -396,9 +397,14 @@ int tipc_mcast_xmit(struct net *net, str
+               msg_set_is_rcast(hdr, method->rcast);
+               /* Switch method ? */
+-              if (rcast != method->rcast)
+-                      tipc_mcast_send_sync(net, skb, method,
+-                                           dests, cong_link_cnt);
++              if (rcast != method->rcast) {
++                      rc = tipc_mcast_send_sync(net, skb, method, dests);
++                      if (unlikely(rc)) {
++                              pr_err("Unable to send SYN: method %d, rc %d\n",
++                                     rcast, rc);
++                              goto exit;
++                      }
++              }
+               if (method->rcast)
+                       rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
diff --git a/queue-5.4/tipc-fix-retrans-failure-due-to-wrong-destination.patch b/queue-5.4/tipc-fix-retrans-failure-due-to-wrong-destination.patch
new file mode 100644 (file)
index 0000000..4da666b
--- /dev/null
@@ -0,0 +1,100 @@
+From abc9b4e0549b93fdaff56e9532bc49a2d7b04955 Mon Sep 17 00:00:00 2001
+From: Tuong Lien <tuong.t.lien@dektech.com.au>
+Date: Tue, 10 Dec 2019 15:21:04 +0700
+Subject: tipc: fix retrans failure due to wrong destination
+
+From: Tuong Lien <tuong.t.lien@dektech.com.au>
+
+commit abc9b4e0549b93fdaff56e9532bc49a2d7b04955 upstream.
+
+When a user message is sent, TIPC will check if the socket has faced
+congestion at the link layer. If that happens, it will sleep to wait
+for the congestion to disappear. This leaves a gap for other users to
+take over the socket (e.g. multiple threads), since the socket is
+released as well. Also, in the case of connectionless sockets (e.g.
+SOCK_RDM), the user is free to send messages to various destinations
+(e.g. via 'sendto()'), so the socket's preformatted header has to be
+updated correspondingly prior to the actual payload message building.
+
+Unfortunately, the latter action is done before the former, which
+opens a race condition: the destination of a certain message can be
+modified incorrectly in the middle, leading to a wrong destination
+when that message is built. Consequently, when the message is sent to
+the link layer, it gets stuck there forever because the peer node will
+simply reject it. After a number of retransmission attempts, the link
+is eventually taken down and the retransmission failure is reported.
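+
+A hypothetical interleaving that triggers this: thread A updates the
+socket header for destination X, hits link congestion and sleeps,
+releasing the socket; thread B then sends to destination Y, rewriting
+the shared header; when thread A wakes up and builds its message, it
+carries Y's destination instead of X's.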
+
+This commit fixes the problem by rearranging the order of actions to
+prevent the race condition from occurring, so the message building is
+'atomic' and its header will not be modified by anyone.
+
+Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
+Acked-by: Jon Maloy <jon.maloy@ericsson.com>
+Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/tipc/socket.c |   32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+--- a/net/tipc/socket.c
++++ b/net/tipc/socket.c
+@@ -1306,8 +1306,8 @@ static int __tipc_sendmsg(struct socket
+       struct tipc_msg *hdr = &tsk->phdr;
+       struct tipc_name_seq *seq;
+       struct sk_buff_head pkts;
+-      u32 dport, dnode = 0;
+-      u32 type, inst;
++      u32 dport = 0, dnode = 0;
++      u32 type = 0, inst = 0;
+       int mtu, rc;
+       if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
+@@ -1360,23 +1360,11 @@ static int __tipc_sendmsg(struct socket
+               type = dest->addr.name.name.type;
+               inst = dest->addr.name.name.instance;
+               dnode = dest->addr.name.domain;
+-              msg_set_type(hdr, TIPC_NAMED_MSG);
+-              msg_set_hdr_sz(hdr, NAMED_H_SIZE);
+-              msg_set_nametype(hdr, type);
+-              msg_set_nameinst(hdr, inst);
+-              msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
+               dport = tipc_nametbl_translate(net, type, inst, &dnode);
+-              msg_set_destnode(hdr, dnode);
+-              msg_set_destport(hdr, dport);
+               if (unlikely(!dport && !dnode))
+                       return -EHOSTUNREACH;
+       } else if (dest->addrtype == TIPC_ADDR_ID) {
+               dnode = dest->addr.id.node;
+-              msg_set_type(hdr, TIPC_DIRECT_MSG);
+-              msg_set_lookup_scope(hdr, 0);
+-              msg_set_destnode(hdr, dnode);
+-              msg_set_destport(hdr, dest->addr.id.ref);
+-              msg_set_hdr_sz(hdr, BASIC_H_SIZE);
+       } else {
+               return -EINVAL;
+       }
+@@ -1387,6 +1375,22 @@ static int __tipc_sendmsg(struct socket
+       if (unlikely(rc))
+               return rc;
++      if (dest->addrtype == TIPC_ADDR_NAME) {
++              msg_set_type(hdr, TIPC_NAMED_MSG);
++              msg_set_hdr_sz(hdr, NAMED_H_SIZE);
++              msg_set_nametype(hdr, type);
++              msg_set_nameinst(hdr, inst);
++              msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
++              msg_set_destnode(hdr, dnode);
++              msg_set_destport(hdr, dport);
++      } else { /* TIPC_ADDR_ID */
++              msg_set_type(hdr, TIPC_DIRECT_MSG);
++              msg_set_lookup_scope(hdr, 0);
++              msg_set_destnode(hdr, dnode);
++              msg_set_destport(hdr, dest->addr.id.ref);
++              msg_set_hdr_sz(hdr, BASIC_H_SIZE);
++      }
++
+       __skb_queue_head_init(&pkts);
+       mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
+       rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);