--- /dev/null
+From 4a132f60808ae3a751e107a373f8572012352d3c Mon Sep 17 00:00:00 2001
+From: Jagan Teki <jagan@amarulasolutions.com>
+Date: Mon, 30 Dec 2019 17:30:19 +0530
+Subject: ARM: dts: imx6q-icore-mipi: Use 1.5 version of i.Core MX6DL
+
+From: Jagan Teki <jagan@amarulasolutions.com>
+
+commit 4a132f60808ae3a751e107a373f8572012352d3c upstream.
+
+The EDIMM STARTER KIT i.Core 1.5 MIPI Evaluation is based on
+the 1.5 version of the i.Core MX6 cpu module. The 1.5 version
+differs from the original one in a few details, including the
+ethernet PHY interface clock provider.
+
+With this commit, the ethernet interface works properly:
+SMSC LAN8710/LAN8720 2188000.ethernet-1:00: attached PHY driver
+
+While before using the 1.5 version, ethernet failed to start up
+due to the un-clocked PHY interface:
+fec 2188000.ethernet eth0: could not attach to PHY
+
+A similar fix was merged for i.Core MX6Q but was missed for the DL variant.
+
+Fixes: a8039f2dd089 ("ARM: dts: imx6dl: Add Engicam i.CoreM6 1.5 Quad/Dual MIPI starter kit support")
+Cc: Jacopo Mondi <jacopo@jmondi.org>
+Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
+Signed-off-by: Jagan Teki <jagan@amarulasolutions.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6dl-icore-mipi.dts | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm/boot/dts/imx6dl-icore-mipi.dts
++++ b/arch/arm/boot/dts/imx6dl-icore-mipi.dts
+@@ -8,7 +8,7 @@
+ /dts-v1/;
+
+ #include "imx6dl.dtsi"
+-#include "imx6qdl-icore.dtsi"
++#include "imx6qdl-icore-1.5.dtsi"
+
+ / {
+ model = "Engicam i.CoreM6 DualLite/Solo MIPI Starter Kit";
--- /dev/null
+From 4521de30fbb3f5be0db58de93582ebce72c9d44f Mon Sep 17 00:00:00 2001
+From: Anson Huang <Anson.Huang@nxp.com>
+Date: Mon, 30 Dec 2019 09:41:07 +0800
+Subject: ARM: dts: imx6qdl-sabresd: Remove incorrect power supply assignment
+
+From: Anson Huang <Anson.Huang@nxp.com>
+
+commit 4521de30fbb3f5be0db58de93582ebce72c9d44f upstream.
+
+The vdd3p0 LDO's input should come directly from the external USB VBUS,
+NOT from the PMIC's power supply. The vdd3p0 LDO's target output voltage
+can be controlled by SW and requires the input voltage to be high
+enough; with the incorrect power supply assigned, if that supply's
+voltage is lower than the LDO's target output voltage, the request fails
+and the LDO voltage adjustment is skipped. So remove the power supply
+assignment for vdd3p0 to avoid this scenario.
+
+Fixes: 93385546ba36 ("ARM: dts: imx6qdl-sabresd: Assign corresponding power supply for LDOs")
+Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6qdl-sabresd.dtsi | 4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/arch/arm/boot/dts/imx6qdl-sabresd.dtsi
++++ b/arch/arm/boot/dts/imx6qdl-sabresd.dtsi
+@@ -749,10 +749,6 @@
+ vin-supply = <&vgen5_reg>;
+ };
+
+-&reg_vdd3p0 {
+- vin-supply = <&sw2_reg>;
+-};
+-
+ &reg_vdd2p5 {
+ vin-supply = <&vgen5_reg>;
+ };
--- /dev/null
+From b4eb9ef0e29cd28c6fd684e0ab77bda824acb20e Mon Sep 17 00:00:00 2001
+From: Anson Huang <Anson.Huang@nxp.com>
+Date: Mon, 30 Dec 2019 09:41:09 +0800
+Subject: ARM: dts: imx6sl-evk: Remove incorrect power supply assignment
+
+From: Anson Huang <Anson.Huang@nxp.com>
+
+commit b4eb9ef0e29cd28c6fd684e0ab77bda824acb20e upstream.
+
+The vdd3p0 LDO's input should come directly from the external USB VBUS,
+NOT from the PMIC's power supply. The vdd3p0 LDO's target output voltage
+can be controlled by SW and requires the input voltage to be high
+enough; with the incorrect power supply assigned, if that supply's
+voltage is lower than the LDO's target output voltage, the request fails
+and the LDO voltage adjustment is skipped. So remove the power supply
+assignment for vdd3p0 to avoid this scenario.
+
+Fixes: 3feea8805d6f ("ARM: dts: imx6sl-evk: Assign corresponding power supply for LDOs")
+Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6sl-evk.dts | 4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/arch/arm/boot/dts/imx6sl-evk.dts
++++ b/arch/arm/boot/dts/imx6sl-evk.dts
+@@ -584,10 +584,6 @@
+ vin-supply = <&sw2_reg>;
+ };
+
+-&reg_vdd3p0 {
+- vin-supply = <&sw2_reg>;
+-};
+-
+ &reg_vdd2p5 {
+ vin-supply = <&sw2_reg>;
+ };
--- /dev/null
+From 3479b2843c78ffb60247f522226ba68f93aee355 Mon Sep 17 00:00:00 2001
+From: Anson Huang <Anson.Huang@nxp.com>
+Date: Mon, 30 Dec 2019 09:41:10 +0800
+Subject: ARM: dts: imx6sll-evk: Remove incorrect power supply assignment
+
+From: Anson Huang <Anson.Huang@nxp.com>
+
+commit 3479b2843c78ffb60247f522226ba68f93aee355 upstream.
+
+The vdd3p0 LDO's input should come directly from the external USB VBUS,
+NOT from the PMIC's power supply. The vdd3p0 LDO's target output voltage
+can be controlled by SW and requires the input voltage to be high
+enough; with the incorrect power supply assigned, if that supply's
+voltage is lower than the LDO's target output voltage, the request fails
+and the LDO voltage adjustment is skipped. So remove the power supply
+assignment for vdd3p0 to avoid this scenario.
+
+Fixes: 96a9169cf621 ("ARM: dts: imx6sll-evk: Assign corresponding power supply for vdd3p0")
+Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6sll-evk.dts | 4 ----
+ 1 file changed, 4 deletions(-)
+
+--- a/arch/arm/boot/dts/imx6sll-evk.dts
++++ b/arch/arm/boot/dts/imx6sll-evk.dts
+@@ -265,10 +265,6 @@
+ status = "okay";
+ };
+
+-&reg_3p0 {
+- vin-supply = <&sw2_reg>;
+-};
+-
+ &snvs_poweroff {
+ status = "okay";
+ };
--- /dev/null
+From d4918ebb5c256d26696a13e78ac68c146111191a Mon Sep 17 00:00:00 2001
+From: Anson Huang <Anson.Huang@nxp.com>
+Date: Mon, 30 Dec 2019 09:41:08 +0800
+Subject: ARM: dts: imx6sx-sdb: Remove incorrect power supply assignment
+
+From: Anson Huang <Anson.Huang@nxp.com>
+
+commit d4918ebb5c256d26696a13e78ac68c146111191a upstream.
+
+The vdd3p0 LDO's input should come directly from the external USB VBUS,
+NOT from the PMIC's power supply. The vdd3p0 LDO's target output voltage
+can be controlled by SW and requires the input voltage to be high
+enough; with the incorrect power supply assigned, if that supply's
+voltage is lower than the LDO's target output voltage, the request fails
+and the LDO voltage adjustment is skipped. So remove the power supply
+assignment for vdd3p0 to avoid this scenario.
+
+Fixes: 37a4bdead109 ("ARM: dts: imx6sx-sdb: Assign corresponding power supply for LDOs")
+Signed-off-by: Anson Huang <Anson.Huang@nxp.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx6sx-sdb-reva.dts | 4 ----
+ arch/arm/boot/dts/imx6sx-sdb.dts | 4 ----
+ 2 files changed, 8 deletions(-)
+
+--- a/arch/arm/boot/dts/imx6sx-sdb-reva.dts
++++ b/arch/arm/boot/dts/imx6sx-sdb-reva.dts
+@@ -159,10 +159,6 @@
+ vin-supply = <&vgen6_reg>;
+ };
+
+-&reg_vdd3p0 {
+- vin-supply = <&sw2_reg>;
+-};
+-
+ &reg_vdd2p5 {
+ vin-supply = <&vgen6_reg>;
+ };
+--- a/arch/arm/boot/dts/imx6sx-sdb.dts
++++ b/arch/arm/boot/dts/imx6sx-sdb.dts
+@@ -141,10 +141,6 @@
+ vin-supply = <&vgen6_reg>;
+ };
+
+-&reg_vdd3p0 {
+- vin-supply = <&sw2_reg>;
+-};
+-
+ &reg_vdd2p5 {
+ vin-supply = <&vgen6_reg>;
+ };
--- /dev/null
+From 4b0b97e651ecf29f20248420b52b6864fbd40bc2 Mon Sep 17 00:00:00 2001
+From: Marcel Ziswiler <marcel.ziswiler@toradex.com>
+Date: Wed, 8 Jan 2020 17:12:31 +0100
+Subject: ARM: dts: imx7: Fix Toradex Colibri iMX7S 256MB NAND flash support
+
+From: Marcel Ziswiler <marcel.ziswiler@toradex.com>
+
+commit 4b0b97e651ecf29f20248420b52b6864fbd40bc2 upstream.
+
+Turns out that when introducing the eMMC version, the gpmi node required
+for NAND flash support got enabled exclusively on the Colibri iMX7D
+512MB, leaving it disabled on the Colibri iMX7S 256MB.
+
+Fixes: f928a4a377e4 ("ARM: dts: imx7: add Toradex Colibri iMX7D 1GB (eMMC) support")
+Signed-off-by: Marcel Ziswiler <marcel.ziswiler@toradex.com>
+Signed-off-by: Shawn Guo <shawnguo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm/boot/dts/imx7s-colibri.dtsi | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/arm/boot/dts/imx7s-colibri.dtsi
++++ b/arch/arm/boot/dts/imx7s-colibri.dtsi
+@@ -49,3 +49,7 @@
+ reg = <0x80000000 0x10000000>;
+ };
+ };
++
++&gpmi {
++ status = "okay";
++};
--- /dev/null
+From c44a4edb20938c85b64a256661443039f5bffdea Mon Sep 17 00:00:00 2001
+From: Bart Van Assche <bvanassche@acm.org>
+Date: Tue, 17 Dec 2019 16:23:29 -0800
+Subject: block: Fix the type of 'sts' in bsg_queue_rq()
+
+From: Bart Van Assche <bvanassche@acm.org>
+
+commit c44a4edb20938c85b64a256661443039f5bffdea upstream.
+
+This patch fixes the following sparse warnings:
+
+block/bsg-lib.c:269:19: warning: incorrect type in initializer (different base types)
+block/bsg-lib.c:269:19: expected int sts
+block/bsg-lib.c:269:19: got restricted blk_status_t [usertype]
+block/bsg-lib.c:286:16: warning: incorrect type in return expression (different base types)
+block/bsg-lib.c:286:16: expected restricted blk_status_t
+block/bsg-lib.c:286:16: got int [assigned] sts
+
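+For reference, a minimal sketch of why sparse complains here:
+blk_status_t is a __bitwise "restricted" type (roughly as defined in
+include/linux/blk_types.h), so mixing it with plain int needs an
+explicit __force cast:
+
+	typedef u8 __bitwise blk_status_t;	/* restricted type */
+	#define BLK_STS_IOERR	((__force blk_status_t)10)
+
+	int bad = BLK_STS_IOERR;		/* sparse: different base types */
+	blk_status_t good = BLK_STS_IOERR;	/* clean */
+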
+Cc: Martin Wilck <mwilck@suse.com>
+Fixes: d46fe2cb2dce ("block: drop device references in bsg_queue_rq()")
+Signed-off-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ block/bsg-lib.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/block/bsg-lib.c
++++ b/block/bsg-lib.c
+@@ -266,7 +266,7 @@ static blk_status_t bsg_queue_rq(struct
+ struct request *req = bd->rq;
+ struct bsg_set *bset =
+ container_of(q->tag_set, struct bsg_set, tag_set);
+- int sts = BLK_STS_IOERR;
++ blk_status_t sts = BLK_STS_IOERR;
+ int ret;
+
+ blk_mq_start_request(req);
--- /dev/null
+From 5afe6ce748c1ea99e0d648153c05075e1ab93afb Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 16 Jan 2020 11:29:20 +0000
+Subject: Btrfs: always copy scrub arguments back to user space
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 5afe6ce748c1ea99e0d648153c05075e1ab93afb upstream.
+
+If scrub returns an error we are not copying back the scrub arguments
+structure to user space. This prevents user space from knowing how much
+progress scrub has made if an error happened - this includes -ECANCELED,
+which is returned when users ask for scrub to stop. A particular use
+case, exercised by btrfs-progs, is resuming scrub after it is canceled;
+in that case it relies on reading the progress from the scrub arguments
+structure and then using that progress in a call to resume scrub.
+
+So fix this by always copying the scrub arguments structure to user
+space, overwriting the returned value with -EFAULT only if copying the
+structure fails, to let user space know that either the copy did not
+happen (and the structure is stale) or it happened partially (and the
+structure is probably invalid due to the partial copy).
+
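+A hedged sketch of the user-space pattern this enables (struct and ioctl
+names per include/uapi/linux/btrfs.h; saved_progress is a hypothetical
+local and error handling is trimmed):
+
+	struct btrfs_ioctl_scrub_args sa = { .devid = devid };
+
+	if (ioctl(fd, BTRFS_IOC_SCRUB, &sa) < 0 && errno == ECANCELED) {
+		/* With this fix sa.progress is valid even on error, so
+		 * a later scrub can resume from it (as btrfs-progs does).
+		 */
+		saved_progress = sa.progress;
+	}
+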
+Reported-by: Graham Cobb <g.btrfs@cobb.uk.net>
+Link: https://lore.kernel.org/linux-btrfs/d0a97688-78be-08de-ca7d-bcb4c7fb397e@cobb.uk.net/
+Fixes: 06fe39ab15a6a4 ("Btrfs: do not overwrite scrub error with fault error in scrub ioctl")
+CC: stable@vger.kernel.org # 5.1+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Tested-by: Graham Cobb <g.btrfs@cobb.uk.net>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/ioctl.c | 14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -4254,7 +4254,19 @@ static long btrfs_ioctl_scrub(struct fil
+ &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+ 0);
+
+- if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
++ /*
++ * Copy scrub args to user space even if btrfs_scrub_dev() returned an
++ * error. This is important as it allows user space to know how much
++ * progress scrub has done. For example, if scrub is canceled we get
++ * -ECANCELED from btrfs_scrub_dev() and return that error back to user
++ * space. Later user space can inspect the progress from the structure
++ * btrfs_ioctl_scrub_args and resume scrub from where it left off
++ * previously (btrfs-progs does this).
++ * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
++ * then return -EFAULT to signal the structure was not copied or it may
++ * be corrupt and unreliable due to a partial copy.
++ */
++ if (copy_to_user(arg, sa, sizeof(*sa)))
+ ret = -EFAULT;
+
+ if (!(sa->flags & BTRFS_SCRUB_READONLY))
--- /dev/null
+From b35cf1f0bf1f2b0b193093338414b9bd63b29015 Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Fri, 10 Jan 2020 11:11:24 -0500
+Subject: btrfs: check rw_devices, not num_devices for balance
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit b35cf1f0bf1f2b0b193093338414b9bd63b29015 upstream.
+
+The fstest btrfs/154 reports
+
+ [ 8675.381709] BTRFS: Transaction aborted (error -28)
+ [ 8675.383302] WARNING: CPU: 1 PID: 31900 at fs/btrfs/block-group.c:2038 btrfs_create_pending_block_groups+0x1e0/0x1f0 [btrfs]
+ [ 8675.390925] CPU: 1 PID: 31900 Comm: btrfs Not tainted 5.5.0-rc6-default+ #935
+ [ 8675.392780] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014
+ [ 8675.395452] RIP: 0010:btrfs_create_pending_block_groups+0x1e0/0x1f0 [btrfs]
+ [ 8675.402672] RSP: 0018:ffffb2090888fb00 EFLAGS: 00010286
+ [ 8675.404413] RAX: 0000000000000000 RBX: ffff92026dfa91c8 RCX: 0000000000000001
+ [ 8675.406609] RDX: 0000000000000000 RSI: ffffffff8e100899 RDI: ffffffff8e100971
+ [ 8675.408775] RBP: ffff920247c61660 R08: 0000000000000000 R09: 0000000000000000
+ [ 8675.410978] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000ffffffe4
+ [ 8675.412647] R13: ffff92026db74000 R14: ffff920247c616b8 R15: ffff92026dfbc000
+ [ 8675.413994] FS: 00007fd5e57248c0(0000) GS:ffff92027d800000(0000) knlGS:0000000000000000
+ [ 8675.416146] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ [ 8675.417833] CR2: 0000564aa51682d8 CR3: 000000006dcbc004 CR4: 0000000000160ee0
+ [ 8675.419801] Call Trace:
+ [ 8675.420742] btrfs_start_dirty_block_groups+0x355/0x480 [btrfs]
+ [ 8675.422600] btrfs_commit_transaction+0xc8/0xaf0 [btrfs]
+ [ 8675.424335] reset_balance_state+0x14a/0x190 [btrfs]
+ [ 8675.425824] btrfs_balance.cold+0xe7/0x154 [btrfs]
+ [ 8675.427313] ? kmem_cache_alloc_trace+0x235/0x2c0
+ [ 8675.428663] btrfs_ioctl_balance+0x298/0x350 [btrfs]
+ [ 8675.430285] btrfs_ioctl+0x466/0x2550 [btrfs]
+ [ 8675.431788] ? mem_cgroup_charge_statistics+0x51/0xf0
+ [ 8675.433487] ? mem_cgroup_commit_charge+0x56/0x400
+ [ 8675.435122] ? do_raw_spin_unlock+0x4b/0xc0
+ [ 8675.436618] ? _raw_spin_unlock+0x1f/0x30
+ [ 8675.438093] ? __handle_mm_fault+0x499/0x740
+ [ 8675.439619] ? do_vfs_ioctl+0x56e/0x770
+ [ 8675.441034] do_vfs_ioctl+0x56e/0x770
+ [ 8675.442411] ksys_ioctl+0x3a/0x70
+ [ 8675.443718] ? trace_hardirqs_off_thunk+0x1a/0x1c
+ [ 8675.445333] __x64_sys_ioctl+0x16/0x20
+ [ 8675.446705] do_syscall_64+0x50/0x210
+ [ 8675.448059] entry_SYSCALL_64_after_hwframe+0x49/0xbe
+ [ 8675.479187] BTRFS: error (device vdb) in btrfs_create_pending_block_groups:2038: errno=-28 No space left
+
+We now use btrfs_can_overcommit() to see if we can flip a block group
+read only. Previously this would fail because we weren't taking into
+account the usable unallocated space for allocating chunks. With my
+patches we were allowed to do the balance, which is technically correct.
+
+The test is trying to start balance on a degraded mount. So now we're
+trying to allocate a chunk and cannot, because we want to allocate a
+RAID1 chunk but there's only 1 device available for use. This results
+in an ENOSPC.
+
+But we shouldn't even be making it this far; we don't have enough
+devices to restripe. The problem is we're using btrfs_num_devices(),
+which also includes missing devices. That's not actually what we want;
+we need to use rw_devices.
+
+The chunk_mutex is not needed here; rw_devices changes only on device
+add, remove or replace, all of which are excluded by the EXCL_OP
+mechanism.
+
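+For context, num_devices feeds the profile validation that follows in
+btrfs_balance(), which is why counting missing devices wrongly widens
+the allowed set. A rough sketch of that check (per the code around the
+changed hunk; constants may differ by kernel version):
+
+	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
+	if (num_devices > 1)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+	if (num_devices > 2)
+		allowed |= BTRFS_BLOCK_GROUP_RAID5;
+	if (num_devices > 3)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID6);
+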
+Fixes: e4d8ec0f65b9 ("Btrfs: implement online profile changing")
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ add stacktrace, update changelog, drop chunk_mutex ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/volumes.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -4066,7 +4066,11 @@ int btrfs_balance(struct btrfs_fs_info *
+ }
+ }
+
+- num_devices = btrfs_num_devices(fs_info);
++ /*
++ * rw_devices will not change at the moment, device add/delete/replace
++ * are excluded by EXCL_OP
++ */
++ num_devices = fs_info->fs_devices->rw_devices;
+
+ /*
+ * SINGLE profile on-disk has no profile bit, but in-memory we have a
--- /dev/null
+From 423a716cd7be16fb08690760691befe3be97d3fc Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 18 Dec 2019 17:20:29 -0500
+Subject: btrfs: do not delete mismatched root refs
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 423a716cd7be16fb08690760691befe3be97d3fc upstream.
+
+btrfs_del_root_ref() will simply WARN_ON() if the ref doesn't match in
+any way, and then continue to delete the reference. This shouldn't
+happen, we have these values because there's more to the reference than
+the original root and the sub root. If any of these checks fail, return
+-ENOENT.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/root-tree.c | 10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/root-tree.c
++++ b/fs/btrfs/root-tree.c
+@@ -376,11 +376,13 @@ again:
+ leaf = path->nodes[0];
+ ref = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_root_ref);
+-
+- WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
+- WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
+ ptr = (unsigned long)(ref + 1);
+- WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
++ if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
++ (btrfs_root_ref_name_len(leaf, ref) != name_len) ||
++ memcmp_extent_buffer(leaf, name, ptr, name_len)) {
++ err = -ENOENT;
++ goto out;
++ }
+ *sequence = btrfs_root_ref_sequence(leaf, ref);
+
+ ret = btrfs_del_item(trans, tree_root, path);
--- /dev/null
+From d49d3287e74ffe55ae7430d1e795e5f9bf7359ea Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 18 Dec 2019 17:20:28 -0500
+Subject: btrfs: fix invalid removal of root ref
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit d49d3287e74ffe55ae7430d1e795e5f9bf7359ea upstream.
+
+If we have the following sequence of events
+
+ btrfs sub create A
+ btrfs sub create A/B
+ btrfs sub snap A C
+ mkdir C/foo
+ mv A/B C/foo
+ rm -rf *
+
+We will end up with a transaction abort.
+
+The reason is that we create a root ref for B pointing to A.
+When we create a snapshot of C we still have B in our tree, but because
+the root ref points to A and not C we will make it appear to be empty.
+
+The problem happens when we move B into C. This removes the root ref
+for B pointing to A and adds a ref of B pointing to C. When we rmdir C
+we'll see that we have a ref to our root and remove the root ref,
+despite it not actually matching our reference name.
+
+Now btrfs_del_root_ref() allowing this to work is a bug as well;
+however, we know that this inode does not actually point to a root ref
+in the first place, so we shouldn't be calling btrfs_del_root_ref() at
+all. Instead, simply look up our dir index for this item and do the
+rest of the removal.
+
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/inode.c | 27 +++++++++++++++++++--------
+ 1 file changed, 19 insertions(+), 8 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -4248,13 +4248,16 @@ static int btrfs_unlink_subvol(struct bt
+ }
+ btrfs_release_path(path);
+
+- ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid,
+- dir_ino, &index, name, name_len);
+- if (ret < 0) {
+- if (ret != -ENOENT) {
+- btrfs_abort_transaction(trans, ret);
+- goto out;
+- }
++ /*
++ * This is a placeholder inode for a subvolume we didn't have a
++ * reference to at the time of the snapshot creation. In the meantime
++ * we could have renamed the real subvol link into our snapshot, so
++ * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
++ * Instead simply lookup the dir_index_item for this entry so we can
++ * remove it. Otherwise we know we have a ref to the root and we can
++ * call btrfs_del_root_ref, and it _shouldn't_ fail.
++ */
++ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
+ di = btrfs_search_dir_index_item(root, path, dir_ino,
+ name, name_len);
+ if (IS_ERR_OR_NULL(di)) {
+@@ -4269,8 +4272,16 @@ static int btrfs_unlink_subvol(struct bt
+ leaf = path->nodes[0];
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ index = key.offset;
++ btrfs_release_path(path);
++ } else {
++ ret = btrfs_del_root_ref(trans, objectid,
++ root->root_key.objectid, dir_ino,
++ &index, name, name_len);
++ if (ret) {
++ btrfs_abort_transaction(trans, ret);
++ goto out;
++ }
+ }
+- btrfs_release_path(path);
+
+ ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
+ if (ret) {
--- /dev/null
+From 26ef8493e1ab771cb01d27defca2fa1315dc3980 Mon Sep 17 00:00:00 2001
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Date: Wed, 8 Jan 2020 21:07:32 +0900
+Subject: btrfs: fix memory leak in qgroup accounting
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+commit 26ef8493e1ab771cb01d27defca2fa1315dc3980 upstream.
+
+When running xfstests on the current btrfs I get the following splat from
+kmemleak:
+
+unreferenced object 0xffff88821b2404e0 (size 32):
+ comm "kworker/u4:7", pid 26663, jiffies 4295283698 (age 8.776s)
+ hex dump (first 32 bytes):
+ 01 00 00 00 00 00 00 00 10 ff fd 26 82 88 ff ff ...........&....
+ 10 ff fd 26 82 88 ff ff 20 ff fd 26 82 88 ff ff ...&.... ..&....
+ backtrace:
+ [<00000000f94fd43f>] ulist_alloc+0x25/0x60 [btrfs]
+ [<00000000fd023d99>] btrfs_find_all_roots_safe+0x41/0x100 [btrfs]
+ [<000000008f17bd32>] btrfs_find_all_roots+0x52/0x70 [btrfs]
+ [<00000000b7660afb>] btrfs_qgroup_rescan_worker+0x343/0x680 [btrfs]
+ [<0000000058e66778>] btrfs_work_helper+0xac/0x1e0 [btrfs]
+ [<00000000f0188930>] process_one_work+0x1cf/0x350
+ [<00000000af5f2f8e>] worker_thread+0x28/0x3c0
+ [<00000000b55a1add>] kthread+0x109/0x120
+ [<00000000f88cbd17>] ret_from_fork+0x35/0x40
+
+This corresponds to:
+
+ (gdb) l *(btrfs_find_all_roots_safe+0x41)
+ 0x8d7e1 is in btrfs_find_all_roots_safe (fs/btrfs/backref.c:1413).
+ 1408
+ 1409 tmp = ulist_alloc(GFP_NOFS);
+ 1410 if (!tmp)
+ 1411 return -ENOMEM;
+ 1412 *roots = ulist_alloc(GFP_NOFS);
+ 1413 if (!*roots) {
+ 1414 ulist_free(tmp);
+ 1415 return -ENOMEM;
+ 1416 }
+ 1417
+
+Following the lifetime of the allocated 'roots' ulist, it gets freed
+again in btrfs_qgroup_account_extent().
+
+But this does not happen if the function is called with the
+'BTRFS_FS_QUOTA_ENABLED' flag cleared; then btrfs_qgroup_account_extent()
+bails out early and returns directly.
+
+Instead of directly returning we should jump to the 'out_free' label in
+order to free all resources as expected.
+
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+[ add comment ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/qgroup.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -2423,8 +2423,12 @@ int btrfs_qgroup_account_extent(struct b
+ u64 nr_old_roots = 0;
+ int ret = 0;
+
++ /*
++ * If quotas get disabled meanwhile, the resources need to be freed and
++ * we can't just exit here.
++ */
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+- return 0;
++ goto out_free;
+
+ if (new_roots) {
+ if (!maybe_fs_roots(new_roots))
--- /dev/null
+From 6282675e6708ec78518cc0e9ad1f1f73d7c5c53d Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 8 Jan 2020 13:12:00 +0800
+Subject: btrfs: relocation: fix reloc_root lifespan and access
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 6282675e6708ec78518cc0e9ad1f1f73d7c5c53d upstream.
+
+[BUG]
+There are several different KASAN reports for balance + snapshot
+workloads. Involved call paths include:
+
+ should_ignore_root+0x54/0xb0 [btrfs]
+ build_backref_tree+0x11af/0x2280 [btrfs]
+ relocate_tree_blocks+0x391/0xb80 [btrfs]
+ relocate_block_group+0x3e5/0xa00 [btrfs]
+ btrfs_relocate_block_group+0x240/0x4d0 [btrfs]
+ btrfs_relocate_chunk+0x53/0xf0 [btrfs]
+ btrfs_balance+0xc91/0x1840 [btrfs]
+ btrfs_ioctl_balance+0x416/0x4e0 [btrfs]
+ btrfs_ioctl+0x8af/0x3e60 [btrfs]
+ do_vfs_ioctl+0x831/0xb10
+
+ create_reloc_root+0x9f/0x460 [btrfs]
+ btrfs_reloc_post_snapshot+0xff/0x6c0 [btrfs]
+ create_pending_snapshot+0xa9b/0x15f0 [btrfs]
+ create_pending_snapshots+0x111/0x140 [btrfs]
+ btrfs_commit_transaction+0x7a6/0x1360 [btrfs]
+ btrfs_mksubvol+0x915/0x960 [btrfs]
+ btrfs_ioctl_snap_create_transid+0x1d5/0x1e0 [btrfs]
+ btrfs_ioctl_snap_create_v2+0x1d3/0x270 [btrfs]
+ btrfs_ioctl+0x241b/0x3e60 [btrfs]
+ do_vfs_ioctl+0x831/0xb10
+
+ btrfs_reloc_pre_snapshot+0x85/0xc0 [btrfs]
+ create_pending_snapshot+0x209/0x15f0 [btrfs]
+ create_pending_snapshots+0x111/0x140 [btrfs]
+ btrfs_commit_transaction+0x7a6/0x1360 [btrfs]
+ btrfs_mksubvol+0x915/0x960 [btrfs]
+ btrfs_ioctl_snap_create_transid+0x1d5/0x1e0 [btrfs]
+ btrfs_ioctl_snap_create_v2+0x1d3/0x270 [btrfs]
+ btrfs_ioctl+0x241b/0x3e60 [btrfs]
+ do_vfs_ioctl+0x831/0xb10
+
+[CAUSE]
+All these call sites rely only on root->reloc_root, which can undergo
+btrfs_drop_snapshot(), and since we don't have real refcount-based
+protection for reloc roots, we can reach an already dropped reloc root,
+triggering KASAN.
+
+[FIX]
+To avoid such access to an unstable root->reloc_root, we should check
+the BTRFS_ROOT_DEAD_RELOC_TREE bit first.
+
+This patch introduces wrappers that provide the correct way to check the
+bit with memory barriers protection.
+
+Most callers don't distinguish between a merged reloc tree and no reloc
+tree. The only exception is should_ignore_root(), as a merged reloc
+tree can be ignored, while no reloc tree shouldn't be.
+
+[CRITICAL SECTION ANALYSIS]
+Although test_bit()/set_bit()/clear_bit() don't imply a barrier, the
+DEAD_RELOC_TREE bit has extra help from the transaction as a higher
+level barrier; the lifespans of root::reloc_root and the
+DEAD_RELOC_TREE bit are:
+
+ NULL: reloc_root is NULL PTR: reloc_root is not NULL
+ 0: DEAD_RELOC_ROOT bit not set DEAD: DEAD_RELOC_ROOT bit set
+
+ (NULL, 0) Initial state __
+ | /\ Section A
+ btrfs_init_reloc_root() \/
+ | __
+ (PTR, 0) reloc_root initialized /\
+ | |
+ btrfs_update_reloc_root() | Section B
+ | |
+ (PTR, DEAD) reloc_root has been merged \/
+ | __
+ === btrfs_commit_transaction() ====================
+ | /\
+ clean_dirty_subvols() |
+ | | Section C
+ (NULL, DEAD) reloc_root cleanup starts \/
+ | __
+ btrfs_drop_snapshot() /\
+ | | Section D
+ (NULL, 0) Back to initial state \/
+
+Every have_reloc_root() or test_bit(DEAD_RELOC_ROOT) caller holds a
+transaction handle, so no such caller can cross a transaction boundary.
+
+In Section A, every caller just finds no DEAD bit and grabs reloc_root.
+
+In the cross section A-B, a caller may see no DEAD bit, but since
+reloc_root is still completely valid, accessing reloc_root is safe.
+
+No test_bit() caller can cross the boundary of Section B and Section C.
+
+In Section C, every caller finds the DEAD bit, so no one will access
+reloc_root.
+
+In the cross section C-D, a caller either sees the DEAD bit set and
+avoids accessing reloc_root regardless of whether that would be safe,
+or sees the DEAD bit cleared and then accesses reloc_root, which is
+already NULL, so nothing goes wrong.
+
+The memory write barriers sit between the reloc_root updates and the
+bit set/clear; the pairing read barrier is placed before test_bit().
+
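+In code form, the pairing added by this patch is (condensed from the
+hunks below):
+
+	/* writer, e.g. clean_dirty_subvols() */
+	root->reloc_root = NULL;
+	smp_wmb();	/* order the NULL store before the bit update */
+	clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+
+	/* reader, reloc_root_is_dead() */
+	smp_rmb();	/* see the bit update before touching reloc_root */
+	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
+		return true;
+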
+Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
+Fixes: d2311e698578 ("btrfs: relocation: Delay reloc tree deletion after merge_reloc_roots")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+[ barriers ]
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/relocation.c | 51 +++++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 46 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -517,6 +517,34 @@ static int update_backref_cache(struct b
+ return 1;
+ }
+
++static bool reloc_root_is_dead(struct btrfs_root *root)
++{
++ /*
++ * Pair with set_bit/clear_bit in clean_dirty_subvols and
++ * btrfs_update_reloc_root. We need to see the updated bit before
++ * trying to access reloc_root
++ */
++ smp_rmb();
++ if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
++ return true;
++ return false;
++}
++
++/*
++ * Check if this subvolume tree has valid reloc tree.
++ *
++ * Reloc tree after swap is considered dead, thus not considered as valid.
++ * This is enough for most callers, as they don't distinguish dead reloc root
++ * from no reloc root. But should_ignore_root() below is a special case.
++ */
++static bool have_reloc_root(struct btrfs_root *root)
++{
++ if (reloc_root_is_dead(root))
++ return false;
++ if (!root->reloc_root)
++ return false;
++ return true;
++}
+
+ static int should_ignore_root(struct btrfs_root *root)
+ {
+@@ -525,6 +553,10 @@ static int should_ignore_root(struct btr
+ if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ return 0;
+
++ /* This root has been merged with its reloc tree, we can ignore it */
++ if (reloc_root_is_dead(root))
++ return 1;
++
+ reloc_root = root->reloc_root;
+ if (!reloc_root)
+ return 0;
+@@ -1439,7 +1471,7 @@ int btrfs_init_reloc_root(struct btrfs_t
+ * The subvolume has reloc tree but the swap is finished, no need to
+ * create/update the dead reloc tree
+ */
+- if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
++ if (reloc_root_is_dead(root))
+ return 0;
+
+ if (root->reloc_root) {
+@@ -1478,8 +1510,7 @@ int btrfs_update_reloc_root(struct btrfs
+ struct btrfs_root_item *root_item;
+ int ret;
+
+- if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state) ||
+- !root->reloc_root)
++ if (!have_reloc_root(root))
+ goto out;
+
+ reloc_root = root->reloc_root;
+@@ -1489,6 +1520,11 @@ int btrfs_update_reloc_root(struct btrfs
+ if (fs_info->reloc_ctl->merge_reloc_tree &&
+ btrfs_root_refs(root_item) == 0) {
+ set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
++ /*
++ * Mark the tree as dead before we change reloc_root so
++ * have_reloc_root will not touch it from now on.
++ */
++ smp_wmb();
+ __del_reloc_root(reloc_root);
+ }
+
+@@ -2202,6 +2238,11 @@ static int clean_dirty_subvols(struct re
+ if (ret2 < 0 && !ret)
+ ret = ret2;
+ }
++ /*
++ * Need barrier to ensure clear_bit() only happens after
++ * root->reloc_root = NULL. Pairs with have_reloc_root.
++ */
++ smp_wmb();
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
+ btrfs_put_fs_root(root);
+ } else {
+@@ -4721,7 +4762,7 @@ void btrfs_reloc_pre_snapshot(struct btr
+ struct btrfs_root *root = pending->root;
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+- if (!root->reloc_root || !rc)
++ if (!rc || !have_reloc_root(root))
+ return;
+
+ if (!rc->merge_reloc_tree)
+@@ -4755,7 +4796,7 @@ int btrfs_reloc_post_snapshot(struct btr
+ struct reloc_control *rc = root->fs_info->reloc_ctl;
+ int ret;
+
+- if (!root->reloc_root || !rc)
++ if (!rc || !have_reloc_root(root))
+ return 0;
+
+ rc = root->fs_info->reloc_ctl;
--- /dev/null
+From 8e57f8acbbd121ecfb0c9dc13b8b030f86c6bd3b Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Mon, 13 Jan 2020 16:29:20 -0800
+Subject: mm, debug_pagealloc: don't rely on static keys too early
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 8e57f8acbbd121ecfb0c9dc13b8b030f86c6bd3b upstream.
+
+Commit 96a2b03f281d ("mm, debug_pagelloc: use static keys to enable
+debugging") has introduced a static key to reduce overhead when
+debug_pagealloc is compiled in but not enabled. It relied on the
+assumption that jump_label_init() is called before parse_early_param()
+as in start_kernel(), so when the "debug_pagealloc=on" option is parsed,
+it is safe to enable the static key.
+
+However, it turns out multiple architectures call parse_early_param()
+earlier from their setup_arch(). x86 also calls jump_label_init() even
+earlier, so no issue was found while testing the commit, but the same is
+not true for e.g. ppc64 and s390, where the kernel would not boot with
+debug_pagealloc=on, as found by our QA.
+
+To fix this without tricky changes to init code of multiple
+architectures, this patch partially reverts the static key conversion
+from 96a2b03f281d. Init-time and non-fastpath calls (such as in arch
+code) of debug_pagealloc_enabled() will again test a simple bool
+variable. Fastpath mm code is converted to a new
+debug_pagealloc_enabled_static() variant that relies on the static key,
+which is enabled in a well-defined point in mm_init() where it's
+guaranteed that jump_label_init() has been called, regardless of
+architecture.
+
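+For reference, the resulting pattern (names as used by this patch; the
+static branch is live only once init_debug_pagealloc() has run):
+
+	/* init/slow path: plain bool, safe before jump_label_init() */
+	if (debug_pagealloc_enabled())
+		...;	/* tests _debug_pagealloc_enabled_early */
+
+	/* fast path: patched jump label, for use after mm_init() */
+	if (debug_pagealloc_enabled_static())
+		...;	/* tests the _debug_pagealloc_enabled static key */
+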
+[sfr@canb.auug.org.au: export _debug_pagealloc_enabled_early]
+ Link: http://lkml.kernel.org/r/20200106164944.063ac07b@canb.auug.org.au
+Link: http://lkml.kernel.org/r/20191219130612.23171-1-vbabka@suse.cz
+Fixes: 96a2b03f281d ("mm, debug_pagelloc: use static keys to enable debugging")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Borislav Petkov <bp@alien8.de>
+Cc: Qian Cai <cai@lca.pw>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mm.h | 18 +++++++++++++++---
+ init/main.c | 1 +
+ mm/page_alloc.c | 37 +++++++++++++------------------------
+ mm/slab.c | 4 ++--
+ mm/slub.c | 2 +-
+ mm/vmalloc.c | 4 ++--
+ 6 files changed, 34 insertions(+), 32 deletions(-)
+
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2666,14 +2666,26 @@ static inline bool want_init_on_free(voi
+ !page_poisoning_enabled();
+ }
+
+-#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+-DECLARE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
++#ifdef CONFIG_DEBUG_PAGEALLOC
++extern void init_debug_pagealloc(void);
+ #else
+-DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
++static inline void init_debug_pagealloc(void) {}
+ #endif
++extern bool _debug_pagealloc_enabled_early;
++DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+
+ static inline bool debug_pagealloc_enabled(void)
+ {
++ return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
++ _debug_pagealloc_enabled_early;
++}
++
++/*
++ * For use in fast paths after init_debug_pagealloc() has run, or when a
++ * false negative result is not harmful when called too early.
++ */
++static inline bool debug_pagealloc_enabled_static(void)
++{
+ if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
+ return false;
+
+--- a/init/main.c
++++ b/init/main.c
+@@ -553,6 +553,7 @@ static void __init mm_init(void)
+ * bigger than MAX_ORDER unless SPARSEMEM.
+ */
+ page_ext_init_flatmem();
++ init_debug_pagealloc();
+ report_meminit();
+ mem_init();
+ kmem_cache_init();
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -694,34 +694,27 @@ void prep_compound_page(struct page *pag
+ #ifdef CONFIG_DEBUG_PAGEALLOC
+ unsigned int _debug_guardpage_minorder;
+
+-#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+-DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
+-#else
++bool _debug_pagealloc_enabled_early __read_mostly
++ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
++EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
+ DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+-#endif
+ EXPORT_SYMBOL(_debug_pagealloc_enabled);
+
+ DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
+
+ static int __init early_debug_pagealloc(char *buf)
+ {
+- bool enable = false;
+-
+- if (kstrtobool(buf, &enable))
+- return -EINVAL;
+-
+- if (enable)
+- static_branch_enable(&_debug_pagealloc_enabled);
+-
+- return 0;
++ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
+ }
+ early_param("debug_pagealloc", early_debug_pagealloc);
+
+-static void init_debug_guardpage(void)
++void init_debug_pagealloc(void)
+ {
+ if (!debug_pagealloc_enabled())
+ return;
+
++ static_branch_enable(&_debug_pagealloc_enabled);
++
+ if (!debug_guardpage_minorder())
+ return;
+
+@@ -1186,7 +1179,7 @@ static __always_inline bool free_pages_p
+ */
+ arch_free_page(page, order);
+
+- if (debug_pagealloc_enabled())
++ if (debug_pagealloc_enabled_static())
+ kernel_map_pages(page, 1 << order, 0);
+
+ kasan_free_nondeferred_pages(page, order);
+@@ -1207,7 +1200,7 @@ static bool free_pcp_prepare(struct page
+
+ static bool bulkfree_pcp_prepare(struct page *page)
+ {
+- if (debug_pagealloc_enabled())
++ if (debug_pagealloc_enabled_static())
+ return free_pages_check(page);
+ else
+ return false;
+@@ -1221,7 +1214,7 @@ static bool bulkfree_pcp_prepare(struct
+ */
+ static bool free_pcp_prepare(struct page *page)
+ {
+- if (debug_pagealloc_enabled())
++ if (debug_pagealloc_enabled_static())
+ return free_pages_prepare(page, 0, true);
+ else
+ return free_pages_prepare(page, 0, false);
+@@ -1973,10 +1966,6 @@ void __init page_alloc_init_late(void)
+
+ for_each_populated_zone(zone)
+ set_zone_contiguous(zone);
+-
+-#ifdef CONFIG_DEBUG_PAGEALLOC
+- init_debug_guardpage();
+-#endif
+ }
+
+ #ifdef CONFIG_CMA
+@@ -2106,7 +2095,7 @@ static inline bool free_pages_prezeroed(
+ */
+ static inline bool check_pcp_refill(struct page *page)
+ {
+- if (debug_pagealloc_enabled())
++ if (debug_pagealloc_enabled_static())
+ return check_new_page(page);
+ else
+ return false;
+@@ -2128,7 +2117,7 @@ static inline bool check_pcp_refill(stru
+ }
+ static inline bool check_new_pcp(struct page *page)
+ {
+- if (debug_pagealloc_enabled())
++ if (debug_pagealloc_enabled_static())
+ return check_new_page(page);
+ else
+ return false;
+@@ -2155,7 +2144,7 @@ inline void post_alloc_hook(struct page
+ set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
+- if (debug_pagealloc_enabled())
++ if (debug_pagealloc_enabled_static())
+ kernel_map_pages(page, 1 << order, 1);
+ kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
+--- a/mm/slab.c
++++ b/mm/slab.c
+@@ -1415,7 +1415,7 @@ static void kmem_rcu_free(struct rcu_hea
+ #if DEBUG
+ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
+ {
+- if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
++ if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
+ (cachep->size % PAGE_SIZE) == 0)
+ return true;
+
+@@ -2007,7 +2007,7 @@ int __kmem_cache_create(struct kmem_cach
+ * to check size >= 256. It guarantees that all necessary small
+ * sized slab is initialized in current slab initialization sequence.
+ */
+- if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
++ if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
+ size >= 256 && cachep->object_size > cache_line_size()) {
+ if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
+ size_t tmp_size = ALIGN(size, PAGE_SIZE);
+--- a/mm/slub.c
++++ b/mm/slub.c
+@@ -290,7 +290,7 @@ static inline void *get_freepointer_safe
+ unsigned long freepointer_addr;
+ void *p;
+
+- if (!debug_pagealloc_enabled())
++ if (!debug_pagealloc_enabled_static())
+ return get_freepointer(s, object);
+
+ freepointer_addr = (unsigned long)object + s->offset;
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -1349,7 +1349,7 @@ static void free_unmap_vmap_area(struct
+ {
+ flush_cache_vunmap(va->va_start, va->va_end);
+ unmap_vmap_area(va);
+- if (debug_pagealloc_enabled())
++ if (debug_pagealloc_enabled_static())
+ flush_tlb_kernel_range(va->va_start, va->va_end);
+
+ free_vmap_area_noflush(va);
+@@ -1647,7 +1647,7 @@ static void vb_free(const void *addr, un
+
+ vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+
+- if (debug_pagealloc_enabled())
++ if (debug_pagealloc_enabled_static())
+ flush_tlb_kernel_range((unsigned long)addr,
+ (unsigned long)addr + size);
+
--- /dev/null
+From 97d3d0f9a1cf132c63c0b8b8bd497b8a56283dd9 Mon Sep 17 00:00:00 2001
+From: "Kirill A. Shutemov" <kirill@shutemov.name>
+Date: Mon, 13 Jan 2020 16:29:10 -0800
+Subject: mm/huge_memory.c: thp: fix conflict of above-47bit hint address and PMD alignment
+
+From: Kirill A. Shutemov <kirill@shutemov.name>
+
+commit 97d3d0f9a1cf132c63c0b8b8bd497b8a56283dd9 upstream.
+
+Patch series "Fix two above-47bit hint address vs. THP bugs".
+
+The two get_unmapped_area() implementations have to be fixed to provide
+THP-friendly mappings if above-47bit hint address is specified.
+
+This patch (of 2):
+
+Filesystems use thp_get_unmapped_area() to provide THP-friendly
+mappings, for DAX in particular.
+
+Normally, the kernel doesn't create userspace mappings above 47-bit,
+even if the machine allows this (such as with 5-level paging on x86-64).
+Not all user space is ready to handle wide addresses. It's known that
+at least some JIT compilers use higher bits in pointers to encode their
+information.
+
+Userspace can ask for allocation from the full address space by
+specifying a hint address (with or without MAP_FIXED) above 47 bits. If
+the application doesn't need a particular address, but wants to allocate
+from the whole address space, it can specify -1 as a hint address.
+
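+For example (a sketch; any hint above the default 47-bit limit opts this
+mapping in to the full address space on x86-64 with 5-level paging):
+
+	/* hint above 47 bits: the kernel may return a wide address */
+	void *p = mmap((void *)(1UL << 48), len, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+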
+Unfortunately, this trick breaks thp_get_unmapped_area(): the function
+would not try to allocate a PMD-aligned area if *any* hint address was
+specified.
+
+Modify the routine to handle it correctly:
+
+ - Try to allocate the space at the specified hint address with length
+ padding required for PMD alignment.
+ - If failed, retry without length padding (but with the same hint
+ address);
+ - If the returned address matches the hint address return it.
+ - Otherwise, align the address as required for THP and return.
+
+The user specified hint address is passed down to get_unmapped_area() so
+above-47bit hint address will be taken into account without breaking
+alignment requirements.
+
+Link: http://lkml.kernel.org/r/20191220142548.7118-2-kirill.shutemov@linux.intel.com
+Fixes: b569bab78d8d ("x86/mm: Prepare to expose larger address space to userspace")
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Reported-by: Thomas Willhalm <thomas.willhalm@intel.com>
+Tested-by: Dan Williams <dan.j.williams@intel.com>
+Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
+Cc: "Bruggeman, Otto G" <otto.g.bruggeman@intel.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/huge_memory.c | 38 ++++++++++++++++++++++++--------------
+ 1 file changed, 24 insertions(+), 14 deletions(-)
+
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -527,13 +527,13 @@ void prep_transhuge_page(struct page *pa
+ set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+ }
+
+-static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
++static unsigned long __thp_get_unmapped_area(struct file *filp,
++ unsigned long addr, unsigned long len,
+ loff_t off, unsigned long flags, unsigned long size)
+ {
+- unsigned long addr;
+ loff_t off_end = off + len;
+ loff_t off_align = round_up(off, size);
+- unsigned long len_pad;
++ unsigned long len_pad, ret;
+
+ if (off_end <= off_align || (off_end - off_align) < size)
+ return 0;
+@@ -542,30 +542,40 @@ static unsigned long __thp_get_unmapped_
+ if (len_pad < len || (off + len_pad) < off)
+ return 0;
+
+- addr = current->mm->get_unmapped_area(filp, 0, len_pad,
++ ret = current->mm->get_unmapped_area(filp, addr, len_pad,
+ off >> PAGE_SHIFT, flags);
+- if (IS_ERR_VALUE(addr))
++
++ /*
++ * The failure might be due to length padding. The caller will retry
++ * without the padding.
++ */
++ if (IS_ERR_VALUE(ret))
+ return 0;
+
+- addr += (off - addr) & (size - 1);
+- return addr;
++ /*
++ * Do not try to align to THP boundary if allocation at the address
++ * hint succeeds.
++ */
++ if (ret == addr)
++ return addr;
++
++ ret += (off - ret) & (size - 1);
++ return ret;
+ }
+
+ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+ {
++ unsigned long ret;
+ loff_t off = (loff_t)pgoff << PAGE_SHIFT;
+
+- if (addr)
+- goto out;
+ if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
+ goto out;
+
+- addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
+- if (addr)
+- return addr;
+-
+- out:
++ ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
++ if (ret)
++ return ret;
++out:
+ return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+ }
+ EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
--- /dev/null
+From 554913f600b45d73de12ad58c1ac7baa0f22a703 Mon Sep 17 00:00:00 2001
+From: Yang Shi <yang.shi@linux.alibaba.com>
+Date: Mon, 13 Jan 2020 16:29:36 -0800
+Subject: mm: khugepaged: add trace status description for SCAN_PAGE_HAS_PRIVATE
+
+From: Yang Shi <yang.shi@linux.alibaba.com>
+
+commit 554913f600b45d73de12ad58c1ac7baa0f22a703 upstream.
+
+Commit 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem)
+FS") introduced a new khugepaged scan result: SCAN_PAGE_HAS_PRIVATE, but
+the corresponding description for the trace events was not added.
+
+Link: http://lkml.kernel.org/r/1574793844-2914-1-git-send-email-yang.shi@linux.alibaba.com
+Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-shmem) FS")
+Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
+Cc: Song Liu <songliubraving@fb.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/trace/events/huge_memory.h | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/include/trace/events/huge_memory.h
++++ b/include/trace/events/huge_memory.h
+@@ -31,7 +31,8 @@
+ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
+ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
+ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
+- EMe(SCAN_TRUNCATED, "truncated") \
++ EM( SCAN_TRUNCATED, "truncated") \
++ EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \
+
+ #undef EM
+ #undef EMe
--- /dev/null
+From 2fe20210fc5f5e62644678b8f927c49f2c6f42a7 Mon Sep 17 00:00:00 2001
+From: Adrian Huang <ahuang12@lenovo.com>
+Date: Mon, 13 Jan 2020 16:29:32 -0800
+Subject: mm: memcg/slab: call flush_memcg_workqueue() only if memcg workqueue is valid
+
+From: Adrian Huang <ahuang12@lenovo.com>
+
+commit 2fe20210fc5f5e62644678b8f927c49f2c6f42a7 upstream.
+
+When booting with amd_iommu=off, the following WARNING message
+appears:
+
+ AMD-Vi: AMD IOMMU disabled on kernel command-line
+ ------------[ cut here ]------------
+ WARNING: CPU: 0 PID: 0 at kernel/workqueue.c:2772 flush_workqueue+0x42e/0x450
+ Modules linked in:
+ CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.5.0-rc3-amd-iommu #6
+ Hardware name: Lenovo ThinkSystem SR655-2S/7D2WRCZ000, BIOS D8E101L-1.00 12/05/2019
+ RIP: 0010:flush_workqueue+0x42e/0x450
+ Code: ff 0f 0b e9 7a fd ff ff 4d 89 ef e9 33 fe ff ff 0f 0b e9 7f fd ff ff 0f 0b e9 bc fd ff ff 0f 0b e9 a8 fd ff ff e8 52 2c fe ff <0f> 0b 31 d2 48 c7 c6 e0 88 c5 95 48 c7 c7 d8 ad f0 95 e8 19 f5 04
+ Call Trace:
+ kmem_cache_destroy+0x69/0x260
+ iommu_go_to_state+0x40c/0x5ab
+ amd_iommu_prepare+0x16/0x2a
+ irq_remapping_prepare+0x36/0x5f
+ enable_IR_x2apic+0x21/0x172
+ default_setup_apic_routing+0x12/0x6f
+ apic_intr_mode_init+0x1a1/0x1f1
+ x86_late_time_init+0x17/0x1c
+ start_kernel+0x480/0x53f
+ secondary_startup_64+0xb6/0xc0
+ ---[ end trace 30894107c3749449 ]---
+ x2apic: IRQ remapping doesn't support X2APIC mode
+ x2apic disabled
+
+The warning is caused by the calling of 'kmem_cache_destroy()'
+in free_iommu_resources(). Here is the call path:
+
+ free_iommu_resources
+ kmem_cache_destroy
+ flush_memcg_workqueue
+ flush_workqueue
+
+The root cause is that the IOMMU subsystem runs before the workqueue
+subsystem, so the variable 'wq_online' is still 'false'. This makes the
+check 'if (WARN_ON(!wq_online))' in flush_workqueue() trigger.
+
+Since the variable 'memcg_kmem_cache_wq' has not been allocated at that
+point, it is unnecessary to call flush_memcg_workqueue(). This prevents
+the WARNING message triggered by flush_workqueue().
+
+Link: http://lkml.kernel.org/r/20200103085503.1665-1-ahuang12@lenovo.com
+Fixes: 92ee383f6daab ("mm: fix race between kmem_cache destroy, create and deactivate")
+Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
+Reported-by: Xiaochun Lee <lixc17@lenovo.com>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Cc: Joerg Roedel <jroedel@suse.de>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Pekka Enberg <penberg@kernel.org>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/slab_common.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -903,7 +903,8 @@ static void flush_memcg_workqueue(struct
+ * deactivates the memcg kmem_caches through workqueue. Make sure all
+ * previous workitems on workqueue are processed.
+ */
+- flush_workqueue(memcg_kmem_cache_wq);
++ if (likely(memcg_kmem_cache_wq))
++ flush_workqueue(memcg_kmem_cache_wq);
+
+ /*
+ * If we're racing with children kmem_cache deactivation, it might
--- /dev/null
+From 4a87e2a25dc27131c3cce5e94421622193305638 Mon Sep 17 00:00:00 2001
+From: Roman Gushchin <guro@fb.com>
+Date: Mon, 13 Jan 2020 16:29:16 -0800
+Subject: mm: memcg/slab: fix percpu slab vmstats flushing
+
+From: Roman Gushchin <guro@fb.com>
+
+commit 4a87e2a25dc27131c3cce5e94421622193305638 upstream.
+
+Currently slab percpu vmstats are flushed twice: during the memcg
+offlining and just before freeing the memcg structure. Each time percpu
+counters are summed, added to the atomic counterparts and propagated up
+by the cgroup tree.
+
+The second flushing is required due to how recursive vmstats are
+implemented: counters are batched in percpu variables on a local level,
+and once a percpu value is crossing some predefined threshold, it spills
+over to atomic values on the local and each ascendant levels. It means
+that without flushing, some numbers cached in percpu variables will be
+dropped on the floor each time a cgroup is destroyed. And with uptime
+the error on upper levels might become noticeable.
+
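+Conceptually, the percpu batching works like this (a simplified sketch,
+not the exact mm/memcontrol.c code; THRESHOLD stands in for the real
+batch size):
+
+	x = __this_cpu_add_return(*pcpu_counter, val);
+	if (abs(x) > THRESHOLD) {	/* spill over to atomic levels */
+		for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+			atomic_long_add(x, &mi->vmstats[idx]);
+		__this_cpu_sub(*pcpu_counter, x);
+	}
+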
+The first flushing aims to make counters on ancestor levels more
+precise. Dying cgroups may remain in the dying state for a long time.
+After kmem_cache reparenting which is performed during the offlining
+slab counters of the dying cgroup don't have any chances to be updated,
+because any slab operations will be performed on the parent level. It
+means that the inaccuracy caused by percpu batching will not decrease up
+to the final destruction of the cgroup. By the original idea flushing
+slab counters during the offlining should minimize the visible
+inaccuracy of slab counters on the parent level.
+
+The problem is that percpu counters are not zeroed after the first
+flushing. So every cached percpu value is summed twice. It creates a
+small error (up to 32 pages per cpu, but usually less) which accumulates
+on parent cgroup level. After creating and destroying of thousands of
+child cgroups, slab counter on parent level can be way off the real
+value.
+
+For now, let's just stop flushing slab counters on memcg offlining. It
+can't be done correctly without scheduling a work on each cpu: reading
+and zeroing it during css offlining can race with an asynchronous
+update, which doesn't expect values to be changed underneath.
+
+With this change, slab counters on parent level will become eventually
+consistent. Once all dying children are gone, values are correct. And
+if not, the error is capped by 32 * NR_CPUS pages per dying cgroup.
+
+It's not perfect, as slabs are reparented, so any updates after the
+reparenting will happen on the parent level. It means that if a slab
+page was allocated, a counter on the child level was bumped, then the
+page was reparented and freed, the annihilation of positive and negative
+counter values will not happen until the child cgroup is released. It
+makes slab counters different from others, and we might eventually want
+to implement flushing in a correct form again. But it's also a question
+of performance: scheduling a work on each cpu isn't free, and it's an
+open question whether the benefit of having more accurate counters is
+worth it.
+
+We might also consider flushing all counters on offlining, not only slab
+counters.
+
+So let's fix the main problem now: make the slab counters eventually
+consistent, so at least the error won't grow with uptime (or more
+precisely the number of created and destroyed cgroups). And think about
+the accuracy of counters separately.
+
+Link: http://lkml.kernel.org/r/20191220042728.1045881-1-guro@fb.com
+Fixes: bee07b33db78 ("mm: memcontrol: flush percpu slab vmstats on kmem offlining")
+Signed-off-by: Roman Gushchin <guro@fb.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mmzone.h | 5 ++---
+ mm/memcontrol.c | 37 +++++++++----------------------------
+ 2 files changed, 11 insertions(+), 31 deletions(-)
+
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -215,9 +215,8 @@ enum node_stat_item {
+ NR_INACTIVE_FILE, /* " " " " " */
+ NR_ACTIVE_FILE, /* " " " " " */
+ NR_UNEVICTABLE, /* " " " " " */
+- NR_SLAB_RECLAIMABLE, /* Please do not reorder this item */
+- NR_SLAB_UNRECLAIMABLE, /* and this one without looking at
+- * memcg_flush_percpu_vmstats() first. */
++ NR_SLAB_RECLAIMABLE,
++ NR_SLAB_UNRECLAIMABLE,
+ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */
+ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */
+ WORKINGSET_NODES,
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -3404,49 +3404,34 @@ static u64 mem_cgroup_read_u64(struct cg
+ }
+ }
+
+-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
++static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
+ {
+- unsigned long stat[MEMCG_NR_STAT];
++ unsigned long stat[MEMCG_NR_STAT] = {0};
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+- int min_idx, max_idx;
+-
+- if (slab_only) {
+- min_idx = NR_SLAB_RECLAIMABLE;
+- max_idx = NR_SLAB_UNRECLAIMABLE;
+- } else {
+- min_idx = 0;
+- max_idx = MEMCG_NR_STAT;
+- }
+-
+- for (i = min_idx; i < max_idx; i++)
+- stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+- for (i = min_idx; i < max_idx; i++)
++ for (i = 0; i < MEMCG_NR_STAT; i++)
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+- for (i = min_idx; i < max_idx; i++)
++ for (i = 0; i < MEMCG_NR_STAT; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+- if (!slab_only)
+- max_idx = NR_VM_NODE_STAT_ITEMS;
+-
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+- for (i = min_idx; i < max_idx; i++)
++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+- for (i = min_idx; i < max_idx; i++)
++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ stat[i] += per_cpu(
+ pn->lruvec_stat_cpu->count[i], cpu);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+- for (i = min_idx; i < max_idx; i++)
++ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+ }
+@@ -3520,13 +3505,9 @@ static void memcg_offline_kmem(struct me
+ parent = root_mem_cgroup;
+
+ /*
+- * Deactivate and reparent kmem_caches. Then flush percpu
+- * slab statistics to have precise values at the parent and
+- * all ancestor levels. It's required to keep slab stats
+- * accurate after the reparenting of kmem_caches.
++ * Deactivate and reparent kmem_caches.
+ */
+ memcg_deactivate_kmem_caches(memcg, parent);
+- memcg_flush_percpu_vmstats(memcg, true);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+@@ -5037,7 +5018,7 @@ static void mem_cgroup_free(struct mem_c
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+- memcg_flush_percpu_vmstats(memcg, false);
++ memcg_flush_percpu_vmstats(memcg);
+ memcg_flush_percpu_vmevents(memcg);
+ __mem_cgroup_free(memcg);
+ }
--- /dev/null
+From 8068df3b60373c390198f660574ea14c8098de57 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Mon, 13 Jan 2020 16:29:07 -0800
+Subject: mm/memory_hotplug: don't free usage map when removing a re-added early section
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 8068df3b60373c390198f660574ea14c8098de57 upstream.
+
+When we remove an early section, we don't free the usage map, as the
+usage maps of other sections are placed into the same page. Once the
+section is removed, it is no longer an early section (notably, the
+memmap is freed). When we re-add that section, the usage map is reused;
+however, the section is no longer an early one. When removing that
+section again, we try to kfree() a usage map that was allocated during
+early boot - bad.
+
+Let's check against PageReserved() to see if we are dealing with a
+usage map that was allocated during boot. We could also check against
+!(PageSlab(usage_page) || PageCompound(usage_page)), but PageReserved()
+is cleaner.
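+
+A small userspace model of why the check matters (illustrative only;
+'boot_allocated' plays the role of the PageReserved() test and free()
+the role of kfree()):
+
+  #include <stdio.h>
+  #include <stdlib.h>
+  #include <stdbool.h>
+
+  struct section { void *usage; bool boot_allocated; };
+
+  static void deactivate(struct section *s)
+  {
+          if (!s->boot_allocated) { /* mirrors !PageReserved(...) */
+                  free(s->usage);
+                  s->usage = NULL;
+          }
+  }
+
+  int main(void)
+  {
+          static char bootmem[64]; /* stands in for the reserved page */
+          struct section early = { bootmem, true };
+          struct section hot = { malloc(64), false };
+
+          deactivate(&early); /* kept: freeing boot memory is invalid */
+          deactivate(&hot);   /* dynamically allocated: safe to free */
+          printf("early usage kept: %s\n", early.usage ? "yes" : "no");
+          return 0;
+  }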
+
+Can be triggered using memtrace under ppc64/powernv:
+
+ $ mount -t debugfs none /sys/kernel/debug/
+ $ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable
+ $ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable
+ ------------[ cut here ]------------
+ kernel BUG at mm/slub.c:3969!
+ Oops: Exception in kernel mode, sig: 5 [#1]
+ LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
+ Modules linked in:
+ CPU: 0 PID: 154 Comm: sh Not tainted 5.5.0-rc2-next-20191216-00005-g0be1dba7b7c0 #61
+ NIP kfree+0x338/0x3b0
+ LR section_deactivate+0x138/0x200
+ Call Trace:
+ section_deactivate+0x138/0x200
+ __remove_pages+0x114/0x150
+ arch_remove_memory+0x3c/0x160
+ try_remove_memory+0x114/0x1a0
+ __remove_memory+0x20/0x40
+ memtrace_enable_set+0x254/0x850
+ simple_attr_write+0x138/0x160
+ full_proxy_write+0x8c/0x110
+ __vfs_write+0x38/0x70
+ vfs_write+0x11c/0x2a0
+ ksys_write+0x84/0x140
+ system_call+0x5c/0x68
+ ---[ end trace 4b053cbd84e0db62 ]---
+
+The first invocation will offline+remove memory blocks. The second
+invocation will first add+online them again, in order to offline+remove
+them again (usually we are lucky and the exact same memory blocks will
+get "reallocated").
+
+Tested on powernv with boot memory: The usage map will not get freed.
+Tested on x86-64 with DIMMs: The usage map will get freed.
+
+Using Dynamic Memory under a Power DLPAR can trigger it easily.
+
+Triggering removal of memory from the HMC GUI (I assume after it was
+previously removed and re-added) can crash the kernel with the same
+call trace; this patch fixes that too.
+
+Link: http://lkml.kernel.org/r/20191217104637.5509-1-david@redhat.com
+Fixes: 326e1b8f83a4 ("mm/sparsemem: introduce a SECTION_IS_EARLY flag")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Tested-by: Pingfan Liu <piliu@redhat.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Michal Hocko <mhocko@kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/sparse.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/sparse.c
++++ b/mm/sparse.c
+@@ -775,7 +775,14 @@ static void section_deactivate(unsigned
+ if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+ unsigned long section_nr = pfn_to_section_nr(pfn);
+
+- if (!section_is_early) {
++ /*
++ * When removing an early section, the usage map is kept (as the
++ * usage maps of other sections fall into the same page). It
++ * will be re-used when re-adding the section - which is then no
++ * longer an early section. If the usage map is PageReserved, it
++ * was allocated during boot.
++ */
++ if (!PageReserved(virt_to_page(ms->usage))) {
+ kfree(ms->usage);
+ ms->usage = NULL;
+ }
--- /dev/null
+From 6d9e8c651dd979aa666bee15f086745f3ea9c4b3 Mon Sep 17 00:00:00 2001
+From: Wen Yang <wenyang@linux.alibaba.com>
+Date: Mon, 13 Jan 2020 16:29:23 -0800
+Subject: mm/page-writeback.c: avoid potential division by zero in wb_min_max_ratio()
+
+From: Wen Yang <wenyang@linux.alibaba.com>
+
+commit 6d9e8c651dd979aa666bee15f086745f3ea9c4b3 upstream.
+
+Patch series "use div64_ul() instead of div_u64() if the divisor is
+unsigned long".
+
+We were first inspired by commit b0ab99e7736a ("sched: Fix possible
+divide by zero in avg_atom() calculation"); then, looking at the
+recently analyzed mm code, we found this suspicious place.
+
+ 201 if (min) {
+ 202 min *= this_bw;
+ 203 do_div(min, tot_bw);
+ 204 }
+
+And we also disassembled and confirmed it:
+
+ /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 201
+ 0xffffffff811c37da <__wb_calc_thresh+234>: xor %r10d,%r10d
+ 0xffffffff811c37dd <__wb_calc_thresh+237>: test %rax,%rax
+ 0xffffffff811c37e0 <__wb_calc_thresh+240>: je 0xffffffff811c3800 <__wb_calc_thresh+272>
+ /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 202
+ 0xffffffff811c37e2 <__wb_calc_thresh+242>: imul %r8,%rax
+ /usr/src/debug/kernel-4.9.168-016.ali3000/linux-4.9.168-016.ali3000.alios7.x86_64/mm/page-writeback.c: 203
+ 0xffffffff811c37e6 <__wb_calc_thresh+246>: mov %r9d,%r10d ---> truncates it to 32 bits here
+ 0xffffffff811c37e9 <__wb_calc_thresh+249>: xor %edx,%edx
+ 0xffffffff811c37eb <__wb_calc_thresh+251>: div %r10
+ 0xffffffff811c37ee <__wb_calc_thresh+254>: imul %rbx,%rax
+ 0xffffffff811c37f2 <__wb_calc_thresh+258>: shr $0x2,%rax
+ 0xffffffff811c37f6 <__wb_calc_thresh+262>: mul %rcx
+ 0xffffffff811c37f9 <__wb_calc_thresh+265>: shr $0x2,%rdx
+ 0xffffffff811c37fd <__wb_calc_thresh+269>: mov %rdx,%r10
+
+This series uses div64_ul() instead of div_u64() if the divisor is
+unsigned long, to avoid truncation to 32-bit on 64-bit platforms.
+
+This patch (of 3):
+
+The divisor 'tot_bw' is an unsigned long, and do_div() truncates it to
+32 bits, which means a value that tests as non-zero can be truncated to
+zero for the division. Fix this issue by using div64_ul() instead.
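+
+The truncation is easy to reproduce in userspace (a hedged demo on a
+64-bit platform; 'tot_bw' is just a stand-in value):
+
+  #include <stdio.h>
+  #include <stdint.h>
+
+  int main(void)
+  {
+          unsigned long tot_bw = 1UL << 32;    /* non-zero 64-bit value */
+          uint32_t divisor = (uint32_t)tot_bw; /* what do_div() divides by */
+
+          printf("tot_bw = %lu, truncated divisor = %u\n", tot_bw, divisor);
+          /* div64_ul() keeps the full 64-bit divisor instead */
+          return 0;
+  }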
+
+Link: http://lkml.kernel.org/r/20200102081442.8273-2-wenyang@linux.alibaba.com
+Fixes: 693108a8a667 ("writeback: make bdi->min/max_ratio handling cgroup writeback aware")
+Signed-off-by: Wen Yang <wenyang@linux.alibaba.com>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Qian Cai <cai@lca.pw>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page-writeback.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -201,11 +201,11 @@ static void wb_min_max_ratio(struct bdi_
+ if (this_bw < tot_bw) {
+ if (min) {
+ min *= this_bw;
+- do_div(min, tot_bw);
++ min = div64_ul(min, tot_bw);
+ }
+ if (max < 100) {
+ max *= this_bw;
+- do_div(max, tot_bw);
++ max = div64_ul(max, tot_bw);
+ }
+ }
+
--- /dev/null
+From 991589974d9c9ecb24ee3799ec8c415c730598a2 Mon Sep 17 00:00:00 2001
+From: "Kirill A. Shutemov" <kirill@shutemov.name>
+Date: Mon, 13 Jan 2020 16:29:13 -0800
+Subject: mm/shmem.c: thp, shmem: fix conflict of above-47bit hint address and PMD alignment
+
+From: Kirill A. Shutemov <kirill@shutemov.name>
+
+commit 991589974d9c9ecb24ee3799ec8c415c730598a2 upstream.
+
+Shmem/tmpfs tries to provide THP-friendly mappings if huge pages are
+enabled. But it doesn't work well with above-47-bit hint addresses.
+
+Normally, the kernel doesn't create userspace mappings above 47-bit,
+even if the machine allows this (such as with 5-level paging on x86-64).
+Not all user space is ready to handle wide addresses. It's known that
+at least some JIT compilers use higher bits in pointers to encode their
+information.
+
+Userspace can ask for allocation from the full address space by
+specifying a hint address (with or without MAP_FIXED) above 47 bits. If
+the application doesn't need a particular address but wants to allocate
+from the whole address space, it can specify -1 as a hint address.
+
+Unfortunately, this trick breaks THP alignment in shmem/tmpfs:
+shmem_get_unmapped_area() would not try to allocate a PMD-aligned area
+if *any* hint address is specified.
+
+This can be fixed by requesting the aligned area only if we failed to
+allocate at the user-specified hint address. The request with the
+inflated length will also take the user-specified hint address. This
+way we will not lose an allocation request from the full address space.
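+
+For reference, the PMD-alignment trick itself is just an over-allocate
+and round-up. A hedged userspace sketch of the arithmetic (the address
+is made up; HPAGE_PMD_SIZE assumes the common 2 MiB huge page):
+
+  #include <stdio.h>
+  #include <stdint.h>
+
+  #define HPAGE_PMD_SIZE (2UL << 20) /* 2 MiB */
+
+  int main(void)
+  {
+          uintptr_t inflated_addr = 0x7f1234501000UL; /* mmap result */
+          uintptr_t aligned = (inflated_addr + HPAGE_PMD_SIZE - 1)
+                              & ~(HPAGE_PMD_SIZE - 1);
+
+          printf("inflated %#lx rounds up to %#lx\n",
+                 (unsigned long)inflated_addr, (unsigned long)aligned);
+          return 0;
+  }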
+
+[kirill@shutemov.name: fold in a fixup]
+ Link: http://lkml.kernel.org/r/20191223231309.t6bh5hkbmokihpfu@box
+Link: http://lkml.kernel.org/r/20191220142548.7118-3-kirill.shutemov@linux.intel.com
+Fixes: b569bab78d8d ("x86/mm: Prepare to expose larger address space to userspace")
+Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: "Willhalm, Thomas" <thomas.willhalm@intel.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: "Bruggeman, Otto G" <otto.g.bruggeman@intel.com>
+Cc: "Aneesh Kumar K . V" <aneesh.kumar@linux.vnet.ibm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/shmem.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/mm/shmem.c
++++ b/mm/shmem.c
+@@ -2106,9 +2106,10 @@ unsigned long shmem_get_unmapped_area(st
+ /*
+ * Our priority is to support MAP_SHARED mapped hugely;
+ * and support MAP_PRIVATE mapped hugely too, until it is COWed.
+- * But if caller specified an address hint, respect that as before.
++ * But if caller specified an address hint and we allocated area there
++ * successfully, respect that as before.
+ */
+- if (uaddr)
++ if (uaddr == addr)
+ return addr;
+
+ if (shmem_huge != SHMEM_HUGE_FORCE) {
+@@ -2142,7 +2143,7 @@ unsigned long shmem_get_unmapped_area(st
+ if (inflated_len < len)
+ return addr;
+
+- inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
++ inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
+ if (IS_ERR_VALUE(inflated_addr))
+ return addr;
+ if (inflated_addr & ~PAGE_MASK)
--- /dev/null
+From 1f26c0d3d24125992ab0026b0dab16c08df947c7 Mon Sep 17 00:00:00 2001
+From: Randy Dunlap <rdunlap@infradead.org>
+Date: Mon, 16 Dec 2019 18:52:45 -0800
+Subject: net: fix kernel-doc warning in <linux/netdevice.h>
+
+From: Randy Dunlap <rdunlap@infradead.org>
+
+commit 1f26c0d3d24125992ab0026b0dab16c08df947c7 upstream.
+
+Fix missing '*' kernel-doc notation that causes this warning:
+
+../include/linux/netdevice.h:1779: warning: bad line: spinlock
+
+Fixes: ab92d68fc22f ("net: core: add generic lockdep keys")
+Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
+Cc: Taehee Yoo <ap420073@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/netdevice.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -1761,7 +1761,7 @@ enum netdev_priv_flags {
+ * for hardware timestamping
+ * @sfp_bus: attached &struct sfp_bus structure.
+ * @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock
+- spinlock
++ * spinlock
+ * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount
+ * @qdisc_xmit_lock_key: lockdep class annotating
+ * netdev_queue->_xmit_lock spinlock
--- /dev/null
+From 8605131747e7e1fd8f6c9f97a00287aae2b2c640 Mon Sep 17 00:00:00 2001
+From: Jose Abreu <Jose.Abreu@synopsys.com>
+Date: Wed, 18 Dec 2019 11:17:41 +0100
+Subject: net: stmmac: 16KB buffer must be 16 byte aligned
+
+From: Jose Abreu <Jose.Abreu@synopsys.com>
+
+commit 8605131747e7e1fd8f6c9f97a00287aae2b2c640 upstream.
+
+The 16KB RX buffer must also be 16-byte aligned. Fix it.
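+
+For reference: the new value 16368 equals 1023 * 16, i.e. the largest
+16-byte multiple strictly below 16384. A plausible reading (an
+assumption, not stated in the patch) is that the DMA descriptor's
+buffer-size field cannot encode the full 16384, so the size is stepped
+down by one 16-byte beat. A trivial arithmetic check in userspace C:
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          int buf = 16368;
+
+          printf("16368 %% 16 = %d\n", buf % 16);      /* 0, aligned */
+          printf("16384 - 16368 = %d\n", 16384 - buf); /* 16 */
+          return 0;
+  }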
+
+Fixes: 7ac6653a085b ("stmmac: Move the STMicroelectronics driver")
+Signed-off-by: Jose Abreu <Jose.Abreu@synopsys.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/ethernet/stmicro/stmmac/common.h | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/ethernet/stmicro/stmmac/common.h
++++ b/drivers/net/ethernet/stmicro/stmmac/common.h
+@@ -364,9 +364,8 @@ struct dma_features {
+ unsigned int arpoffsel;
+ };
+
+-/* GMAC TX FIFO is 8K, Rx FIFO is 16K */
+-#define BUF_SIZE_16KiB 16384
+-/* RX Buffer size must be < 8191 and multiple of 4/8/16 bytes */
++/* RX Buffer size must be multiple of 4/8/16 bytes */
++#define BUF_SIZE_16KiB 16368
+ #define BUF_SIZE_8KiB 8188
+ #define BUF_SIZE_4KiB 4096
+ #define BUF_SIZE_2KiB 2048
--- /dev/null
+From b2f3a481c4cd62f78391b836b64c0a6e72b503d2 Mon Sep 17 00:00:00 2001
+From: Jose Abreu <Jose.Abreu@synopsys.com>
+Date: Wed, 18 Dec 2019 11:17:42 +0100
+Subject: net: stmmac: Enable 16KB buffer size
+
+From: Jose Abreu <Jose.Abreu@synopsys.com>
+
+commit b2f3a481c4cd62f78391b836b64c0a6e72b503d2 upstream.
+
+XGMAC supports a maximum MTU of up to 16KB. Let's add this check to the
+calculation of the RX buffer size.
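+
+The resulting mapping from MTU to buffer size is easy to model in a
+standalone sketch (illustrative userspace C; the constants mirror the
+driver's, the DEFAULT_BUFSIZE fallback is elided):
+
+  #include <stdio.h>
+
+  #define BUF_SIZE_16KiB 16368
+  #define BUF_SIZE_8KiB  8188
+  #define BUF_SIZE_4KiB  4096
+  #define BUF_SIZE_2KiB  2048
+
+  static int set_bfsize(int mtu, int bufsize)
+  {
+          int ret = bufsize;
+
+          if (mtu >= BUF_SIZE_8KiB)
+                  ret = BUF_SIZE_16KiB;
+          else if (mtu >= BUF_SIZE_4KiB)
+                  ret = BUF_SIZE_8KiB;
+          else if (mtu >= BUF_SIZE_2KiB)
+                  ret = BUF_SIZE_4KiB;
+          return ret;
+  }
+
+  int main(void)
+  {
+          printf("mtu 9000 -> %d\n", set_bfsize(9000, 1536)); /* 16368 */
+          printf("mtu 1500 -> %d\n", set_bfsize(1500, 1536)); /* 1536 */
+          return 0;
+  }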
+
+Fixes: 7ac6653a085b ("stmmac: Move the STMicroelectronics driver")
+Signed-off-by: Jose Abreu <Jose.Abreu@synopsys.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+@@ -1108,7 +1108,9 @@ static int stmmac_set_bfsize(int mtu, in
+ {
+ int ret = bufsize;
+
+- if (mtu >= BUF_SIZE_4KiB)
++ if (mtu >= BUF_SIZE_8KiB)
++ ret = BUF_SIZE_16KiB;
++ else if (mtu >= BUF_SIZE_4KiB)
+ ret = BUF_SIZE_8KiB;
+ else if (mtu >= BUF_SIZE_2KiB)
+ ret = BUF_SIZE_4KiB;
--- /dev/null
+From 723c0011c7f6992f57e2c629fa9c89141acc115f Mon Sep 17 00:00:00 2001
+From: Geert Uytterhoeven <geert+renesas@glider.be>
+Date: Wed, 20 Nov 2019 15:26:13 +0100
+Subject: reset: Fix {of,devm}_reset_control_array_get kerneldoc return types
+
+From: Geert Uytterhoeven <geert+renesas@glider.be>
+
+commit 723c0011c7f6992f57e2c629fa9c89141acc115f upstream.
+
+of_reset_control_array_get() and devm_reset_control_array_get() return
+struct reset_control pointers, not internal struct reset_control_array
+pointers, just like all other reset control API calls.
+
+Correct the kerneldoc to match the code.
+
+Fixes: 17c82e206d2a3cd8 ("reset: Add APIs to manage array of resets")
+Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
+Signed-off-by: Philipp Zabel <p.zabel@pengutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/reset/core.c | 6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/drivers/reset/core.c
++++ b/drivers/reset/core.c
+@@ -861,8 +861,7 @@ static int of_reset_control_get_count(st
+ * @acquired: only one reset control may be acquired for a given controller
+ * and ID
+ *
+- * Returns pointer to allocated reset_control_array on success or
+- * error on failure
++ * Returns pointer to allocated reset_control on success or error on failure
+ */
+ struct reset_control *
+ of_reset_control_array_get(struct device_node *np, bool shared, bool optional,
+@@ -915,8 +914,7 @@ EXPORT_SYMBOL_GPL(of_reset_control_array
+ * that just have to be asserted or deasserted, without any
+ * requirements on the order.
+ *
+- * Returns pointer to allocated reset_control_array on success or
+- * error on failure
++ * Returns pointer to allocated reset_control on success or error on failure
+ */
+ struct reset_control *
+ devm_reset_control_array_get(struct device *dev, bool shared, bool optional)
perf-hists-fix-variable-name-s-inconsistency-in-hists__for_each-macro.patch
locking-lockdep-fix-buffer-overrun-problem-in-stack_trace.patch
perf-report-fix-incorrectly-added-dimensions-as-switch-perf-data-file.patch
+mm-shmem.c-thp-shmem-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch
+mm-huge_memory.c-thp-fix-conflict-of-above-47bit-hint-address-and-pmd-alignment.patch
+mm-memcg-slab-fix-percpu-slab-vmstats-flushing.patch
+mm-memcg-slab-call-flush_memcg_workqueue-only-if-memcg-workqueue-is-valid.patch
+mm-debug_pagealloc-don-t-rely-on-static-keys-too-early.patch
+btrfs-fix-invalid-removal-of-root-ref.patch
+btrfs-do-not-delete-mismatched-root-refs.patch
+btrfs-relocation-fix-reloc_root-lifespan-and-access.patch
+btrfs-fix-memory-leak-in-qgroup-accounting.patch
+btrfs-check-rw_devices-not-num_devices-for-balance.patch
+btrfs-always-copy-scrub-arguments-back-to-user-space.patch
+mm-memory_hotplug-don-t-free-usage-map-when-removing-a-re-added-early-section.patch
+mm-page-writeback.c-avoid-potential-division-by-zero-in-wb_min_max_ratio.patch
+mm-khugepaged-add-trace-status-description-for-scan_page_has_private.patch
+arm-dts-imx6qdl-sabresd-remove-incorrect-power-supply-assignment.patch
+arm-dts-imx6sx-sdb-remove-incorrect-power-supply-assignment.patch
+arm-dts-imx6sl-evk-remove-incorrect-power-supply-assignment.patch
+arm-dts-imx6sll-evk-remove-incorrect-power-supply-assignment.patch
+arm-dts-imx6q-icore-mipi-use-1.5-version-of-i.core-mx6dl.patch
+arm-dts-imx7-fix-toradex-colibri-imx7s-256mb-nand-flash-support.patch
+net-stmmac-16kb-buffer-must-be-16-byte-aligned.patch
+net-stmmac-enable-16kb-buffer-size.patch
+reset-fix-of-devm-_reset_control_array_get-kerneldoc-return-types.patch
+tipc-fix-potential-hanging-after-b-rcast-changing.patch
+tipc-fix-retrans-failure-due-to-wrong-destination.patch
+net-fix-kernel-doc-warning-in-linux-netdevice.h.patch
+block-fix-the-type-of-sts-in-bsg_queue_rq.patch
--- /dev/null
+From dca4a17d24ee9d878836ce5eb8dc25be1ffa5729 Mon Sep 17 00:00:00 2001
+From: Tuong Lien <tuong.t.lien@dektech.com.au>
+Date: Tue, 10 Dec 2019 15:21:03 +0700
+Subject: tipc: fix potential hanging after b/rcast changing
+
+From: Tuong Lien <tuong.t.lien@dektech.com.au>
+
+commit dca4a17d24ee9d878836ce5eb8dc25be1ffa5729 upstream.
+
+In commit c55c8edafa91 ("tipc: smooth change between replicast and
+broadcast"), we allow instant switching between replicast and broadcast
+by sending a dummy 'SYN' packet on the last used link to synchronize
+packets on the links. The 'SYN' message is itself subject to link
+congestion, and if that happens, a 'SOCK_WAKEUP' will be scheduled to
+be sent back to the socket...
+However, in that commit, we simply use the same socket 'cong_link_cnt'
+counter for both the 'SYN' and the normal payload message sending.
+Therefore, if both the replicast and broadcast links are congested, the
+counter will not be updated correctly but instead overwritten by the
+latter congestion. Later on, when the 'SOCK_WAKEUP' messages are
+processed, the counter is decremented one by one and eventually
+overflows. Consequently, further activities on the socket will just
+wait for the false congestion signal to disappear, which never happens.
+
+Because sending the 'SYN' message is vital for the mechanism, it should
+be done anyway. This commit fixes the issue by marking the message with
+an error code, e.g. 'TIPC_ERR_NO_PORT', so that sending it cannot face
+link congestion and there is no need to touch the socket's
+'cong_link_cnt' either. In addition, in the event of any error (e.g.
+-ENOBUFS), we purge the entire payload message queue and return
+immediately.
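+
+An illustrative model of the counter misuse described above (userspace
+C, not TIPC code; the plain assignments stand in for the xmit calls
+writing through the shared pointer):
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          unsigned int cong_link_cnt = 0;
+
+          cong_link_cnt = 1; /* SYN congests on the broadcast link */
+          cong_link_cnt = 1; /* payload congestion overwrites, not adds */
+
+          cong_link_cnt--;   /* first SOCK_WAKEUP processed */
+          cong_link_cnt--;   /* second one wraps the unsigned counter */
+
+          printf("cong_link_cnt = %u\n", cong_link_cnt); /* 4294967295 */
+          return 0;
+  }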
+
+Fixes: c55c8edafa91 ("tipc: smooth change between replicast and broadcast")
+Acked-by: Jon Maloy <jon.maloy@ericsson.com>
+Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/tipc/bcast.c | 24 +++++++++++++++---------
+ 1 file changed, 15 insertions(+), 9 deletions(-)
+
+--- a/net/tipc/bcast.c
++++ b/net/tipc/bcast.c
+@@ -305,17 +305,17 @@ static int tipc_rcast_xmit(struct net *n
+ * @skb: socket buffer to copy
+ * @method: send method to be used
+ * @dests: destination nodes for message.
+- * @cong_link_cnt: returns number of encountered congested destination links
+ * Returns 0 if success, otherwise errno
+ */
+ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb,
+ struct tipc_mc_method *method,
+- struct tipc_nlist *dests,
+- u16 *cong_link_cnt)
++ struct tipc_nlist *dests)
+ {
+ struct tipc_msg *hdr, *_hdr;
+ struct sk_buff_head tmpq;
+ struct sk_buff *_skb;
++ u16 cong_link_cnt;
++ int rc = 0;
+
+ /* Is a cluster supporting with new capabilities ? */
+ if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL))
+@@ -343,18 +343,19 @@ static int tipc_mcast_send_sync(struct n
+ _hdr = buf_msg(_skb);
+ msg_set_size(_hdr, MCAST_H_SIZE);
+ msg_set_is_rcast(_hdr, !msg_is_rcast(hdr));
++ msg_set_errcode(_hdr, TIPC_ERR_NO_PORT);
+
+ __skb_queue_head_init(&tmpq);
+ __skb_queue_tail(&tmpq, _skb);
+ if (method->rcast)
+- tipc_bcast_xmit(net, &tmpq, cong_link_cnt);
++ rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt);
+ else
+- tipc_rcast_xmit(net, &tmpq, dests, cong_link_cnt);
++ rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt);
+
+ /* This queue should normally be empty by now */
+ __skb_queue_purge(&tmpq);
+
+- return 0;
++ return rc;
+ }
+
+ /* tipc_mcast_xmit - deliver message to indicated destination nodes
+@@ -396,9 +397,14 @@ int tipc_mcast_xmit(struct net *net, str
+ msg_set_is_rcast(hdr, method->rcast);
+
+ /* Switch method ? */
+- if (rcast != method->rcast)
+- tipc_mcast_send_sync(net, skb, method,
+- dests, cong_link_cnt);
++ if (rcast != method->rcast) {
++ rc = tipc_mcast_send_sync(net, skb, method, dests);
++ if (unlikely(rc)) {
++ pr_err("Unable to send SYN: method %d, rc %d\n",
++ rcast, rc);
++ goto exit;
++ }
++ }
+
+ if (method->rcast)
+ rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt);
--- /dev/null
+From abc9b4e0549b93fdaff56e9532bc49a2d7b04955 Mon Sep 17 00:00:00 2001
+From: Tuong Lien <tuong.t.lien@dektech.com.au>
+Date: Tue, 10 Dec 2019 15:21:04 +0700
+Subject: tipc: fix retrans failure due to wrong destination
+
+From: Tuong Lien <tuong.t.lien@dektech.com.au>
+
+commit abc9b4e0549b93fdaff56e9532bc49a2d7b04955 upstream.
+
+When a user message is sent, TIPC will check if the socket has faced
+congestion at the link layer. If so, it will sleep to wait for the
+congestion to disappear. This leaves a gap for other users to take over
+the socket (e.g. multiple threads), since the socket is released as
+well. Also, in the connectionless case (e.g. SOCK_RDM), a user is free
+to send messages to various destinations (e.g. via 'sendto()'), so the
+socket's preformatted header has to be updated accordingly prior to the
+actual payload message building.
+
+Unfortunately, the latter action is done before the former, which
+causes a race: the destination of a given message can be modified in
+the middle, so the message ends up being built with the wrong
+destination. Consequently, when the message is sent to the link layer,
+it gets stuck there forever, because the peer node will simply reject
+it. After a number of retransmission attempts, the link is eventually
+taken down and the retransmission failure is reported.
+
+This commit fixes the problem by rearranging the order of actions to
+prevent the race condition from occurring, so the message building is
+'atomic' and its header will not be modified by anyone.
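+
+A compact model of the race (illustrative userspace C with the thread
+interleaving written out sequentially; the struct is a stand-in for the
+socket's preformatted header):
+
+  #include <stdio.h>
+
+  struct hdr { int destnode; };
+
+  int main(void)
+  {
+          struct hdr shared = { 0 };
+
+          shared.destnode = 1; /* A fills the header, then sleeps */
+          shared.destnode = 2; /* B takes the released socket, retargets */
+
+          /* A wakes up and builds its payload from the shared header */
+          printf("A's message built for node %d, expected 1\n",
+                 shared.destnode);
+          return 0;
+  }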
+
+Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion")
+Acked-by: Jon Maloy <jon.maloy@ericsson.com>
+Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/tipc/socket.c | 32 ++++++++++++++++++--------------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+--- a/net/tipc/socket.c
++++ b/net/tipc/socket.c
+@@ -1306,8 +1306,8 @@ static int __tipc_sendmsg(struct socket
+ struct tipc_msg *hdr = &tsk->phdr;
+ struct tipc_name_seq *seq;
+ struct sk_buff_head pkts;
+- u32 dport, dnode = 0;
+- u32 type, inst;
++ u32 dport = 0, dnode = 0;
++ u32 type = 0, inst = 0;
+ int mtu, rc;
+
+ if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE))
+@@ -1360,23 +1360,11 @@ static int __tipc_sendmsg(struct socket
+ type = dest->addr.name.name.type;
+ inst = dest->addr.name.name.instance;
+ dnode = dest->addr.name.domain;
+- msg_set_type(hdr, TIPC_NAMED_MSG);
+- msg_set_hdr_sz(hdr, NAMED_H_SIZE);
+- msg_set_nametype(hdr, type);
+- msg_set_nameinst(hdr, inst);
+- msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
+ dport = tipc_nametbl_translate(net, type, inst, &dnode);
+- msg_set_destnode(hdr, dnode);
+- msg_set_destport(hdr, dport);
+ if (unlikely(!dport && !dnode))
+ return -EHOSTUNREACH;
+ } else if (dest->addrtype == TIPC_ADDR_ID) {
+ dnode = dest->addr.id.node;
+- msg_set_type(hdr, TIPC_DIRECT_MSG);
+- msg_set_lookup_scope(hdr, 0);
+- msg_set_destnode(hdr, dnode);
+- msg_set_destport(hdr, dest->addr.id.ref);
+- msg_set_hdr_sz(hdr, BASIC_H_SIZE);
+ } else {
+ return -EINVAL;
+ }
+@@ -1387,6 +1375,22 @@ static int __tipc_sendmsg(struct socket
+ if (unlikely(rc))
+ return rc;
+
++ if (dest->addrtype == TIPC_ADDR_NAME) {
++ msg_set_type(hdr, TIPC_NAMED_MSG);
++ msg_set_hdr_sz(hdr, NAMED_H_SIZE);
++ msg_set_nametype(hdr, type);
++ msg_set_nameinst(hdr, inst);
++ msg_set_lookup_scope(hdr, tipc_node2scope(dnode));
++ msg_set_destnode(hdr, dnode);
++ msg_set_destport(hdr, dport);
++ } else { /* TIPC_ADDR_ID */
++ msg_set_type(hdr, TIPC_DIRECT_MSG);
++ msg_set_lookup_scope(hdr, 0);
++ msg_set_destnode(hdr, dnode);
++ msg_set_destport(hdr, dest->addr.id.ref);
++ msg_set_hdr_sz(hdr, BASIC_H_SIZE);
++ }
++
+ __skb_queue_head_init(&pkts);
+ mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
+ rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);